Skip to content
Snippets Groups Projects
Commit b3d5c15d authored by Stephan Porada's avatar Stephan Porada :speech_balloon:
Browse files

CQiWrapper new data structure

parent 30f60b60
No related branches found
No related tags found
No related merge requests found
from .CQiClient import CQiClient from CQiClient import CQiClient
from .CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND from CQi import CONST_FIELD_MATCH, CONST_FIELD_MATCHEND
import collections
import re import re
from app import logger # only works if imported into opaque web app # from app import logger # only works if imported into opaque web app
class CQiWrapper(CQiClient): class CQiWrapper(CQiClient):
...@@ -55,16 +54,16 @@ class CQiWrapper(CQiClient): ...@@ -55,16 +54,16 @@ class CQiWrapper(CQiClient):
self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
+ '.' + '.'
+ struct_attr) + struct_attr)
logger.warning(('All positional and ' # logger.warning(('All positional and '
'structural attributes: {}').format(self.attr_strings)) # 'structural attributes: {}').format(self.attr_strings))
def select_corpus(self, corpus_name): def select_corpus(self, corpus_name):
if corpus_name in self.corpus_list_coprora(): if corpus_name in self.corpus_list_coprora():
self.corpus_name = corpus_name self.corpus_name = corpus_name
self.__create_attribute_strings() self.__create_attribute_strings()
logger.warning('{} does exist.'.format(corpus_name)) # logger.warning('{} does exist.'.format(corpus_name))
else: else:
logger.warning('{} does not exist.'.format(corpus_name)) # logger.warning('{} does not exist.'.format(corpus_name))
pass pass
def disconnect(self): def disconnect(self):
...@@ -75,7 +74,7 @@ class CQiWrapper(CQiClient): ...@@ -75,7 +74,7 @@ class CQiWrapper(CQiClient):
""" """
self.ctrl_bye() self.ctrl_bye()
self.connection.close() self.connection.close()
logger.warning('Disconnected from cqp server.') # logger.warning('Disconnected from cqp server.')
def query_subcorpus(self, query, result_subcorpus_name='Query-results'): def query_subcorpus(self, query, result_subcorpus_name='Query-results'):
""" """
...@@ -95,7 +94,7 @@ class CQiWrapper(CQiClient): ...@@ -95,7 +94,7 @@ class CQiWrapper(CQiClient):
+ result_subcorpus_name) + result_subcorpus_name)
self.SUBCORPUS_NAMES.append(self.result_subcorpus) self.SUBCORPUS_NAMES.append(self.result_subcorpus)
self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus) self.nr_matches = self.cqp_subcorpus_size(self.result_subcorpus)
logger.warning('Nr of all matches is: {}'.format(self.nr_matches)) # logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
def show_subcorpora(self): def show_subcorpora(self):
""" """
...@@ -125,7 +124,7 @@ class CQiWrapper(CQiClient): ...@@ -125,7 +124,7 @@ class CQiWrapper(CQiClient):
) )
self.nr_matches = min(result_len, self.nr_matches) self.nr_matches = min(result_len, self.nr_matches)
if self.nr_matches == 0: if self.nr_matches == 0:
logger.warning('Query resulted in 0 matches.') # logger.warning('Query resulted in 0 matches.')
return None return None
else: else:
# Get match cpos boundries # Get match cpos boundries
...@@ -141,86 +140,49 @@ class CQiWrapper(CQiClient): ...@@ -141,86 +140,49 @@ class CQiWrapper(CQiClient):
0, 0,
self.nr_matches - 1)) self.nr_matches - 1))
# Generate all cpos between boundries including start and end boundries # Generate all cpos between match boundries including start and end boundries.
# Save them as list into on match entry at serial number 'i' # Also generate cpos for left and right context.
ordered_matches = collections.OrderedDict() # Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
for i, match_pair in enumerate(match_boundaries): # Also collect all cpos together in one list for the final request of
ordered_matches[i] = ({'match_cpos': # all cpos informations
list(range(match_pair[0], all_matches = []
match_pair[1] + 1))}) all_cpos = []
# Saves cpos form all match entries into one list for start, end in match_boundaries:
all_cpos_list = [] lc_cpos = list(range(max([0, start - self.context_len]), start))
for key in ordered_matches.keys(): lc = {'lc': lc_cpos}
all_cpos_list += ordered_matches[key]['match_cpos'] match_cpos = list(range(start, end + 1))
match = {'hit': match_cpos}
# Saves all cpos from before and after context into the list: rc_cpos = list(range(end + 1, min([self.corpus_max_len, end + self.context_len + 1])))
# all_context_cpos_list rc = {'rc': rc_cpos}
all_context_cpos_list = [] lc.update(match)
for key in ordered_matches.keys(): lc.update(rc)
cpos_list = ordered_matches[key]['match_cpos'] all_cpos.extend(lc_cpos + match_cpos + rc_cpos)
before_index = max([0, cpos_list[0] - self.context_len]) all_matches.append(lc)
after_index = min([self.corpus_max_len, # print(all_matches)
cpos_list[-1] + self.context_len]) # print(all_cpos)
ordered_matches[key]['left_context_cpos'] = list(range(before_index,
cpos_list[0])) # Get all sentences IDs for all above collected cpos in all_cpos
ordered_matches[key]['right_context_cpos'] = list(range(cpos_list[-1] + 1, s_ids = self.cl_cpos2struc('UTOPIEN.s', all_cpos) # CHANGE to CORPUS.s will always be like this in nopaque
after_index + 1)) # Get all cpos for all sneteces boundries
all_context_cpos_list += ordered_matches[key]['left_context_cpos'] s_lookup = {}
all_context_cpos_list += ordered_matches[key]['right_context_cpos'] for s_id in set(s_ids):
# Combines all_cpos_list with all_context_cpos_list as a sorted set s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id) # CHANGE to CORPUS.s will always be like this in nopaque
all_cpos_list += all_context_cpos_list # print(s_start, s_end)
all_cpos_list = sorted(list(set(all_cpos_list))) s_cpos = range(s_start, s_end)
s_lookup.update({s_id: list(s_cpos)})
# print(list(s_cpos))
all_cpos.extend(s_cpos)
all_cpos = list(set(all_cpos)) # get rid of cpos duplicates
# Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for # Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
# all cpos entries in all_cpos_list # all cpos entries in all_cpos_list
# Also saves these informations into the ordered_matches dict # Also saves these informations into self.results dict
all_cpos_infos, s_list = self.get_cpos_infos(all_cpos_list) all_cpos_infos, text_lookup = self.get_cpos_infos(all_cpos)
for key in ordered_matches.keys():
# loops over cpos in cpos_list which holds all match cpos self.results = {'matches': all_matches, 'cpos_lookup': all_cpos_infos,
# Replaces one cpos with the corresponding cpos information created 's_lookup': s_lookup, 'text_lookup': text_lookup}
# by self.get_cpos_infos(all_cpos_list) return self.results
cpos_list = ordered_matches[key]['match_cpos'] # print(self.results)
infos = []
for cpos in cpos_list:
info = {cpos: all_cpos_infos.get(cpos)}
infos.append(info)
ordered_matches[key]['match_cpos'] = infos
try:
# loops over cpos in ordered_matches[key]['left_context_cpos']
# which holds all cpos of the before context
# Replaces one cpos with the corresponding cpos information created
# by self.get_cpos_infos(all_cpos_list)
before_context_infos = []
for context_before_cpos in ordered_matches[key]['left_context_cpos']:
before_context_info = {context_before_cpos:
all_cpos_infos.get(context_before_cpos)}
before_context_infos.append(before_context_info)
ordered_matches[key]['left_context_cpos'] = before_context_infos
except UnboundLocalError:
logger.warning('Context before cpos list is empty.')
pass
try:
# loops over cpos in ordered_matches[key]['right_context_cpos']
# which holds all cpos of the before context
# Replaces one cpos with the corresponding cpos information created
# by self.get_cpos_infos(all_cpos_list)
after_context_infos = []
for context_after_cpos in ordered_matches[key]['right_context_cpos']:
after_context_info = {context_after_cpos:
all_cpos_infos.get(context_after_cpos)}
after_context_infos.append(after_context_info)
ordered_matches[key]['right_context_cpos'] = after_context_infos
except UnboundLocalError:
logger.warning('Context after cpos list is empty.')
pass
sentences = {}
s_list = set(s_list)
for s_id in s_list:
s_start, s_end = self.cl_struc2cpos('CORPUS.s', s_id)
sentence = self.cl_cpos2str('CORPUS.word', range(s_start, s_end + 1))
sentences.update({s_id: re.sub(r' (?=\W)', '', ' '.join(sentence))})
ordered_matches['sentences'] = sentences
return ordered_matches
def get_cpos_infos(self, all_cpos): def get_cpos_infos(self, all_cpos):
''' '''
...@@ -228,25 +190,42 @@ class CQiWrapper(CQiClient): ...@@ -228,25 +190,42 @@ class CQiWrapper(CQiClient):
all cpos entries specified in the parameter all_cpos. all cpos entries specified in the parameter all_cpos.
''' '''
cpos_infos = {} cpos_infos = {}
s_list = [] for p_attr_key in self.attr_strings['positional_attrs'].keys():
for key in self.attr_strings.keys(): match_strs = self.cl_cpos2str(self.attr_strings['positional_attrs'][p_attr_key], all_cpos)
if key == 'positional_attrs': cpos_infos[p_attr_key] = match_strs
for p_attr_key in self.attr_strings[key].keys():
match_strs = self.cl_cpos2str(self.attr_strings[key][p_attr_key], tmp_s_info = []
all_cpos) tmp_text_info = []
cpos_infos[p_attr_key] = match_strs text_lookup = {}
elif key == 'struct_attrs': tmp_dict = {}
for struct_attr_key in self.attr_strings[key].keys(): for struct_attr_key in self.attr_strings['struct_attrs'].keys():
struct_entry = self.cl_cpos2struc(self.attr_strings[key][struct_attr_key], check = self.attr_strings['struct_attrs'][struct_attr_key]
all_cpos) if check == 'UTOPIEN.s':
has_value = self.corpus_structural_attribute_has_values(self.attr_strings[key][struct_attr_key]) struct_ids = self.cl_cpos2struc(check, all_cpos)
if has_value: for id in struct_ids:
match_strs = self.cl_struc2str(self.attr_strings[key][struct_attr_key], struct_entry) tmp_s_info.append({struct_attr_key: id})
elif self.attr_strings[key][struct_attr_key] == 'CORPUS.s': elif check == 'UTOPIEN.entry':
s_list.extend(struct_entry) struct_ids = self.cl_cpos2struc(check, all_cpos)
else: for id in struct_ids:
match_strs = [None for i in struct_entry] tmp_text_info.append({struct_attr_key: id})
cpos_infos[struct_attr_key] = zip(struct_entry, match_strs) else:
struct_ids = struct_ids = self.cl_cpos2struc(check, all_cpos)
struct_values = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_ids)
for value in struct_values:
for id in struct_ids:
tmp_dict.update({id: {struct_attr_key: value}})
print(tmp_dict)
print(text_lookup)
# struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][struct_attr_key], all_cpos)
# has_value = self.corpus_structural_attribute_has_values(self.attr_strings['struct_attrs'][struct_attr_key])
# if has_value:
# match_strs = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_entry)
# elif self.attr_strings['struct_attrs'][struct_attr_key] == 'CORPUS.s':
# pass
# else:
# match_strs = [None for i in struct_entry]
# cpos_infos[struct_attr_key] = zip(struct_entry, match_strs)
tmp_list = [] tmp_list = []
attr_key_list = [] attr_key_list = []
for key in cpos_infos.keys(): for key in cpos_infos.keys():
...@@ -256,4 +235,7 @@ class CQiWrapper(CQiClient): ...@@ -256,4 +235,7 @@ class CQiWrapper(CQiClient):
dict_cpos_infos = {} dict_cpos_infos = {}
for info in joined_cpos_infos: for info in joined_cpos_infos:
dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:])) dict_cpos_infos[info[0]] = dict(zip(attr_key_list, info[1:]))
return dict_cpos_infos, s_list for key, s_id, text_id in zip(dict_cpos_infos.keys(), tmp_s_info, tmp_text_info):
dict_cpos_infos[key].update(s_id)
dict_cpos_infos[key].update(text_id)
return dict_cpos_infos, text_lookup
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment