CQiWrapper new data structure

b3d5c15d · Stephan Porada · 30f60b60 · b3d5c15d
Commit b3d5c15d authored 5 years ago by Stephan Porada
--- a/app/corpora/CQiWrapper/CQiWrapper.py
+++ b/app/corpora/CQiWrapper/CQiWrapper.py
-from .CQiClient import CQiClient
-from .CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
-import collections
+from CQiClient import CQiClient
+from CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
 import re
-from app import logger  # only works if imported into opaque web app
+# from app import logger  # only works if imported into opaque web app


 class CQiWrapper(CQiClient):
@@ -55,16 +54,16 @@ class CQiWrapper(CQiClient):
            self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
                                                              + '.'
                                                              + struct_attr)
-        logger.warning(('All positional and '
-                        'structural attributes: {}').format(self.attr_strings))
+        # logger.warning(('All positional and '
+                        # 'structural attributes: {}').format(self.attr_strings))

    def select_corpus(self, corpus_name):
        if corpus_name in self.corpus_list_coprora():
            self.corpus_name = corpus_name
            self.__create_attribute_strings()
-            logger.warning('{} does exist.'.format(corpus_name))
+            # logger.warning('{} does exist.'.format(corpus_name))
        else:
-            logger.warning('{} does not exist.'.format(corpus_name))
+            # logger.warning('{} does not exist.'.format(corpus_name))
            pass

    def disconnect(self):
@@ -75,7 +74,7 @@ class CQiWrapper(CQiClient):
        """
        self.ctrl_bye()
        self.connection.close()
-        logger.warning('Disconnected from cqp server.')
+        # logger.warning('Disconnected from cqp server.')

    def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
        """
@@ -95,7 +94,7 @@ class CQiWrapper(CQiClient):
                                 + result_subcorpus_name)
        self.SUBCORPUS_NAMES.append(self.result_subcorpus)
        self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
-        logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
+        # logger.warning('Nr of all matches is: {}'.format(self.nr_matches))

    def show_subcorpora(self):
        """
@@ -125,7 +124,7 @@ class CQiWrapper(CQiClient):
                              )
        self.nr_matches = min(result_len, self.nr_matches)
        if self.nr_matches == 0:
-            logger.warning('Query resulted in 0 matches.')
+            # logger.warning('Query resulted in 0 matches.')
            return None
        else:
            # Get match cpos boundries
@@ -141,86 +140,49 @@ class CQiWrapper(CQiClient):
                                                           0,
                                                           self.nr_matches - 1))

-        # Generate all cpos between boundries including start and end boundries
-        # Save them as list into on match entry at serial number 'i'
-        ordered_matches = collections.OrderedDict()
-        for i, match_pair in enumerate(match_boundaries):
-            ordered_matches[i] = ({'match_cpos':
-                                   list(range(match_pair[0],
-                                              match_pair[1] + 1))})
-        # Saves cpos form all match entries into one list
-        all_cpos_list = []
-        for key in ordered_matches.keys():
-            all_cpos_list += ordered_matches[key]['match_cpos']
-
-        # Saves all cpos from before and after context into the list:
-        # all_context_cpos_list
-        all_context_cpos_list = []
-        for key in ordered_matches.keys():
-            cpos_list = ordered_matches[key]['match_cpos']
-            before_index = max([0, cpos_list[0] - self.context_len])
-            after_index = min([self.corpus_max_len,
-                               cpos_list[-1] + self.context_len])
-            ordered_matches[key]['left_context_cpos'] = list(range(before_index,
-                                                                          cpos_list[0]))
-            ordered_matches[key]['right_context_cpos'] = list(range(cpos_list[-1] + 1,
-                                                                         after_index + 1))
-            all_context_cpos_list += ordered_matches[key]['left_context_cpos']
-            all_context_cpos_list += ordered_matches[key]['right_context_cpos']
-        # Combines all_cpos_list with all_context_cpos_list as a sorted set
-        all_cpos_list += all_context_cpos_list
-        all_cpos_list = sorted(list(set(all_cpos_list)))
+        # Generate all cpos between the match boundaries, including the start and end boundaries.
+        # Also generate the cpos for the left and right context.
+        # Save those cpos into a dict as lists under the keys 'lc', 'hit' and 'rc'.
+        # Also collect all cpos together in one list for the final request of
+        # all cpos information.
+        all_matches = []
+        all_cpos = []
+        for start, end in match_boundaries:
+            lc_cpos = list(range(max([0, start - self.context_len]), start))
+            lc = {'lc': lc_cpos}
+            match_cpos = list(range(start, end + 1))
+            match = {'hit': match_cpos}
+            rc_cpos = list(range(end + 1, min([self.corpus_max_len, end + self.context_len + 1])))
+            rc = {'rc': rc_cpos}
+            lc.update(match)
+            lc.update(rc)
+            all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
+            all_matches.append(lc)
+        # print(all_matches)
+        # print(all_cpos)
+
+        # Get the sentence IDs for all of the cpos collected above in all_cpos
+        s_ids = self.cl_cpos2struc('UTOPIEN.s', all_cpos) # CHANGE to CORPUS.s will always be like this in nopaque
+        # Get all cpos within the boundaries of each of those sentences
+        s_lookup = {}
+        for s_id in set(s_ids):
+            s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id)  # CHANGE to CORPUS.s will always be like this in nopaque
+            # print(s_start, s_end)
+            s_cpos = range(s_start, s_end)
+            s_lookup.update({s_id: list(s_cpos)})
+            # print(list(s_cpos))
+            all_cpos.extend(s_cpos)
+        all_cpos = list(set(all_cpos)) # get rid of cpos duplicates

        # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
        # all cpos entries in all_cpos_list
-        # Also saves these informations into the ordered_matches dict
-        all_cpos_infos, s_list = self.get_cpos_infos(all_cpos_list)
-        for key in ordered_matches.keys():
-            # loops over cpos in cpos_list which holds all match cpos
-            # Replaces one cpos with the corresponding cpos information created
-            # by self.get_cpos_infos(all_cpos_list)
-            cpos_list = ordered_matches[key]['match_cpos']
-            infos = []
-            for cpos in cpos_list:
-                info = {cpos: all_cpos_infos.get(cpos)}
-                infos.append(info)
-            ordered_matches[key]['match_cpos'] = infos
-            try:
-                # loops over cpos in ordered_matches[key]['left_context_cpos']
-                # which holds all cpos of the before context
-                # Replaces one cpos with the corresponding cpos information created
-                # by self.get_cpos_infos(all_cpos_list)
-                before_context_infos = []
-                for context_before_cpos in ordered_matches[key]['left_context_cpos']:
-                    before_context_info = {context_before_cpos:
-                                           all_cpos_infos.get(context_before_cpos)}
-                    before_context_infos.append(before_context_info)
-                ordered_matches[key]['left_context_cpos'] = before_context_infos
-            except UnboundLocalError:
-                logger.warning('Context before cpos list is empty.')
-                pass
-            try:
-                # loops over cpos in ordered_matches[key]['right_context_cpos']
-                # which holds all cpos of the before context
-                # Replaces one cpos with the corresponding cpos information created
-                # by self.get_cpos_infos(all_cpos_list)
-                after_context_infos = []
-                for context_after_cpos in ordered_matches[key]['right_context_cpos']:
-                    after_context_info = {context_after_cpos:
-                                          all_cpos_infos.get(context_after_cpos)}
-                    after_context_infos.append(after_context_info)
-                ordered_matches[key]['right_context_cpos'] = after_context_infos
-            except UnboundLocalError:
-                logger.warning('Context after cpos list is empty.')
-                pass
-        sentences = {}
-        s_list = set(s_list)
-        for s_id in s_list:
-            s_start, s_end = self.cl_struc2cpos('CORPUS.s', s_id)
-            sentence = self.cl_cpos2str('CORPUS.word', range(s_start, s_end + 1))
-            sentences.update({s_id: re.sub(r' (?=\W)', '', ' '.join(sentence))})
-        ordered_matches['sentences'] = sentences
-        return ordered_matches
+        # Also saves this information into the self.results dict
+        all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
+
+        self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos,
+                        's_lookup': s_lookup, 'text_lookup': text_lookup}
+        return self.results
+        # print(self.results)

    def get_cpos_infos(self, all_cpos):
        '''
@@ -228,25 +190,42 @@ class CQiWrapper(CQiClient):
        all cpos entries specified in the parameter all_cpos.
        '''
        cpos_infos = {}
-        s_list = []
-        for key in self.attr_strings.keys():
-            if key == 'positional_attrs':
-                for p_attr_key in self.attr_strings[key].keys():
-                    match_strs = self.cl_cpos2str(self.attr_strings[key][p_attr_key],
-                                                  all_cpos)
-                    cpos_infos[p_attr_key] = match_strs
-            elif key == 'struct_attrs':
-                for struct_attr_key in self.attr_strings[key].keys():
-                    struct_entry = self.cl_cpos2struc(self.attr_strings[key][struct_attr_key],
-                                                      all_cpos)
-                    has_value = self.corpus_structural_attribute_has_values(self.attr_strings[key][struct_attr_key])
-                    if has_value:
-                        match_strs = self.cl_struc2str(self.attr_strings[key][struct_attr_key], struct_entry)
-                    elif self.attr_strings[key][struct_attr_key] == 'CORPUS.s':
-                        s_list.extend(struct_entry)
-                    else:
-                        match_strs = [None for i in struct_entry]
-                    cpos_infos[struct_attr_key] = zip(struct_entry, match_strs)
+        for p_attr_key in self.attr_strings['positional_attrs'].keys():
+            match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos)
+            cpos_infos[p_attr_key] = match_strs
+
+        tmp_s_info = []
+        tmp_text_info = []
+        text_lookup = {}
+        tmp_dict = {}
+        for struct_attr_key in self.attr_strings['struct_attrs'].keys():
+            check = self.attr_strings['struct_attrs'][struct_attr_key]
+            if check == 'UTOPIEN.s':
+                struct_ids = self.cl_cpos2struc(check, all_cpos)
+                for id in struct_ids:
+                    tmp_s_info.append({struct_attr_key: id})
+            elif check == 'UTOPIEN.entry':
+                struct_ids = self.cl_cpos2struc(check, all_cpos)
+                for id in struct_ids:
+                    tmp_text_info.append({struct_attr_key: id})
+            else:
+                struct_ids = struct_ids = self.cl_cpos2struc(check, all_cpos)
+                struct_values = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_ids)
+                for value in struct_values:
+                    for id in struct_ids:
+                        tmp_dict.update({id: {struct_attr_key: value}})
+        print(tmp_dict)
+        print(text_lookup)
+
+            # struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][struct_attr_key], all_cpos)
+            # has_value = self.corpus_structural_attribute_has_values(self.attr_strings['struct_attrs'][struct_attr_key])
+            # if has_value:
+            #     match_strs = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_entry)
+            # elif self.attr_strings['struct_attrs'][struct_attr_key] == 'CORPUS.s':
+            #     pass
+            # else:
+            #     match_strs = [None for i in struct_entry]
+            # cpos_infos[struct_attr_key] = zip(struct_entry, match_strs)
        tmp_list = []
        attr_key_list = []
        for key in cpos_infos.keys():
@@ -256,4 +235,7 @@ class CQiWrapper(CQiClient):
        dict_cpos_infos = {}
        for info in joined_cpos_infos:
            dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:]))
-        return dict_cpos_infos, s_list
+        for key, s_id, text_id in zip(dict_cpos_infos.keys(), tmp_s_info, tmp_text_info):
+            dict_cpos_infos[key].update(s_id)
+            dict_cpos_infos[key].update(text_id)
+        return dict_cpos_infos, text_lookup