Skip to content
Snippets Groups Projects
Commit ffed8592 authored by Stephan Porada's avatar Stephan Porada :speech_balloon:
Browse files

Add new CQiWrapper

parent 0392d254
No related branches found
No related tags found
No related merge requests found
from .CQiClient import CQiClient from .CQiClient import CQiClient
import multiprocessing import multiprocessing
import collections import collections
import socket
class CQiWrapper(CQiClient): class CQiWrapper(CQiClient):
...@@ -33,21 +32,41 @@ class CQiWrapper(CQiClient): ...@@ -33,21 +32,41 @@ class CQiWrapper(CQiClient):
""" """
self.ctrl_connect(self.username, self.password) self.ctrl_connect(self.username, self.password)
def create_attribute_strings(self, corpus_name): def create_attribute_strings(self):
self.word_str = corpus_name + '.word' p_attrs = self.corpus_positional_attributes(self.corpus_name)
self.lemma_str = corpus_name + '.lemma' struct_attrs = self.corpus_structural_attributes(self.corpus_name)
self.pos_str = corpus_name + '.pos' self.meta_struct_element = struct_attrs[0]
self.sem_str = corpus_name + '.sem' print(p_attrs)
self.entry_str = corpus_name + '.entry' print(struct_attrs)
self.entry_author_str = self.entry_str + '_author' self.attr_strings = {}
self.entry_title_str = self.entry_str + '_title' self.attr_strings['positional_attrs'] = {}
self.attributes = [self.word_str, self.attr_strings['struct_attrs'] = {}
self.lemma_str, for p_attr in p_attrs:
self.pos_str, self.attr_strings['positional_attrs'][p_attr] = (self.corpus_name
self.sem_str, + '.'
self.entry_str, + p_attr)
self.entry_author_str, for struct_attr in struct_attrs[:-1]:
self.entry_title_str] self.attr_strings['struct_attrs'][struct_attr] = (self.corpus_name
+ '.'
+ struct_attr)
# self.word_str = corpus_name + '.word'
# self.lemma_str = corpus_name + '.lemma'
# self.pos_str = corpus_name + '.pos'
# self.sem_str = corpus_name + '.sem'
# self.entry_str = corpus_name + '.entry'
# self.entry_author_str = self.entry_str + '_author'
# self.entry_title_str = self.entry_str + '_title'
# self.attributes = [self.word_str,
# self.lemma_str,
# self.pos_str,
# self.sem_str,
# self.entry_str,
# self.entry_author_str,
# self.entry_title_str]
# print(self.attributes)
def set_corpus_name(self, corpus_name):
    """
    Remember the corpus this wrapper operates on

    Keyword arguments:
    corpus_name -- name of the corpus, stored as self.corpus_name
    """
    self.corpus_name = corpus_name
def disconnect(self): def disconnect(self):
""" """
...@@ -58,7 +77,7 @@ class CQiWrapper(CQiClient): ...@@ -58,7 +77,7 @@ class CQiWrapper(CQiClient):
self.ctrl_bye() self.ctrl_bye()
self.connection.close() self.connection.close()
def query_subcorpus(self, corpus_name, result_subcorpus_name, query): def query_subcorpus(self, result_subcorpus_name, query):
""" """
Create subcorpus Create subcorpus
...@@ -66,13 +85,12 @@ class CQiWrapper(CQiClient): ...@@ -66,13 +85,12 @@ class CQiWrapper(CQiClient):
positions for that query. positions for that query.
Keyword arguments: Keyword arguments:
corpus_name -- name of the corpus the query will be used on
result_subcorpus_name -- user set name of the subcorpus which holds all result_subcorpus_name -- user set name of the subcorpus which holds all
cpos match positions, produced by the query cpos match positions, produced by the query
query -- query written in cqp query language query -- query written in cqp query language
""" """
self.cqp_query(corpus_name, result_subcorpus_name, query) self.cqp_query(self.corpus_name, result_subcorpus_name, query)
self.result_subcorpus_ns = (corpus_name self.result_subcorpus_ns = (self.corpus_name
+ ':' + ':'
+ result_subcorpus_name) + result_subcorpus_name)
self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns) self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
...@@ -80,11 +98,9 @@ class CQiWrapper(CQiClient): ...@@ -80,11 +98,9 @@ class CQiWrapper(CQiClient):
print('Nr of all matches is:', self.nr_matches) print('Nr of all matches is:', self.nr_matches)
def show_subcorpora(self): def show_subcorpora(self):
print('Known subcorpora:', self.SUBCORPUS_NAMES) return self.cqp_list_subcorpora(self.corpus_name)
return self.SUBCORPUS_NAMES
def show_results(self, def show_results(self,
corpus_name,
result_start_count=0, result_start_count=0,
result_max_count=50, result_max_count=50,
context_len=10,): context_len=10,):
...@@ -116,7 +132,6 @@ class CQiWrapper(CQiClient): ...@@ -116,7 +132,6 @@ class CQiWrapper(CQiClient):
]) ])
Keyword arguments: Keyword arguments:
corpus_name -- name of the parent corpus the subcorpus is part of
result_start_count -- start position of the dumped subcorpus. result_start_count -- start position of the dumped subcorpus.
(default 0) If it is 0 matches 0 to 50 will be shown. If it is 50 (default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
matches 50 to 100 will be shown. matches 50 to 100 will be shown.
...@@ -126,8 +141,7 @@ class CQiWrapper(CQiClient): ...@@ -126,8 +141,7 @@ class CQiWrapper(CQiClient):
shown (default 10) shown (default 10)
""" """
self.context_len = context_len self.context_len = context_len
word_str = corpus_name + '.word' self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word'])
self.corpus_max_len = self.cl_attribute_size(word_str)
if self.nr_matches == 0: if self.nr_matches == 0:
print('Query resulted in 0 matches.') print('Query resulted in 0 matches.')
else: else:
...@@ -157,7 +171,7 @@ class CQiWrapper(CQiClient): ...@@ -157,7 +171,7 @@ class CQiWrapper(CQiClient):
match = multiprocessing.Process(target=self.__get_matches, match = multiprocessing.Process(target=self.__get_matches,
args=(i, args=(i,
index_pair, index_pair,
corpus_name, self.corpus_name,
return_dict)) return_dict))
matches.append(match) matches.append(match)
match.start() match.start()
...@@ -167,7 +181,25 @@ class CQiWrapper(CQiClient): ...@@ -167,7 +181,25 @@ class CQiWrapper(CQiClient):
ordered_results = collections.OrderedDict() ordered_results = collections.OrderedDict()
for key in sorted(return_dict.keys()): for key in sorted(return_dict.keys()):
ordered_results[key] = return_dict[key] ordered_results[key] = return_dict[key]
print('ORDERED_RESULTS', ordered_results) return ordered_results
def get_cpos_info(self, cpos, session):
    """
    Collect attribute values for a span of corpus positions

    Every positional attribute in self.attr_strings['positional_attrs'] and
    every structural attribute in self.attr_strings['struct_attrs'] is looked
    up for the positions range(cpos[0], cpos[1]) through the given session.

    Keyword arguments:
    cpos -- pair [start, end]; start is inclusive, end is exclusive
    session -- object offering cl_cpos2str, cl_cpos2struc and cl_struc2str;
    all lookups are sent through it

    Returns a dict keyed by attribute name. Positional attributes map to the
    value returned by cl_cpos2str, structural attributes map to the set of
    values returned by cl_struc2str.
    """
    cpos_range = range(cpos[0], cpos[1])
    # A missing category is skipped silently.
    positional_attrs = self.attr_strings.get('positional_attrs', {})
    struct_attrs = self.attr_strings.get('struct_attrs', {})

    match_dict = {}
    for p_attr_key, p_attr_str in positional_attrs.items():
        match_dict[p_attr_key] = session.cl_cpos2str(p_attr_str, cpos_range)

    if struct_attrs:
        # The structure lookup depends only on the cpos span and the meta
        # element, so it is resolved once instead of once per attribute.
        # NOTE(review): assumes cl_cpos2struc returns a reusable sequence
        # (not a one-shot iterator) -- confirm against CQiClient.
        struct_entry = session.cl_cpos2struc(
            struct_attrs[self.meta_struct_element], cpos_range)
        for struct_attr_key, struct_attr_str in struct_attrs.items():
            match_str = session.cl_struc2str(struct_attr_str, struct_entry)
            match_dict[struct_attr_key] = set(match_str)
    return match_dict
def __get_matches(self, i, index_pair, corpus_name, return_dict): def __get_matches(self, i, index_pair, corpus_name, return_dict):
""" """
...@@ -183,58 +215,46 @@ class CQiWrapper(CQiClient): ...@@ -183,58 +215,46 @@ class CQiWrapper(CQiClient):
return_dict -- dictionary created with manager.dict() that holds the return_dict -- dictionary created with manager.dict() that holds the
extracted strings tags etc. extracted strings tags etc.
""" """
print('START:', index_pair[0]) # print('START:', index_pair[0])
print('END:', index_pair[1]) # print('END:', index_pair[1])
print('=============================') # print('=============================')
index_pair = [index_pair[0], index_pair[1] + 1]
tmp_session = CQiWrapper(username=self.username, password=self.password, tmp_session = CQiWrapper(username=self.username, password=self.password,
host=self.host, port=self.port) host=self.host, port=self.port)
tmp_session.connect() tmp_session.connect()
tokens = tmp_session.cl_cpos2str(self.word_str, match = self.get_cpos_info(index_pair, tmp_session)
range(index_pair[0], # tokens = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
index_pair[1] + 1)) # range(index_pair[0],
lemmas = tmp_session.cl_cpos2str(self.lemma_str, # index_pair[1] + 1))
range(index_pair[0], # lemmas = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['lemma'],
index_pair[1] + 1)) # range(index_pair[0],
pos_tags = tmp_session.cl_cpos2str(self.pos_str, # index_pair[1] + 1))
range(index_pair[0], # pos_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['pos'],
index_pair[1] + 1)) # range(index_pair[0],
sem_tags = tmp_session.cl_cpos2str(self.sem_str, # index_pair[1] + 1))
range(index_pair[0], # sem_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['sem'],
index_pair[1] + 1)) # range(index_pair[0],
struc_entry = tmp_session.cl_cpos2struc(self.entry_str, # index_pair[1] + 1))
range(index_pair[0], # struc_entry = tmp_session.cl_cpos2struc(self.attr_strings['struct_attrs']['entry'],
index_pair[1] + 1)) # range(index_pair[0],
# index_pair[1] + 1))
before_index = max([0, index_pair[0] - self.context_len]) before_index = max([0, index_pair[0] - self.context_len])
after_index = min([self.corpus_max_len, after_index = min([self.corpus_max_len,
index_pair[1] + self.context_len]) index_pair[1] + self.context_len])
context_before = tmp_session.cl_cpos2str(self.word_str, context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
range(before_index, range(before_index,
index_pair[0])) index_pair[0]))
context_after = tmp_session.cl_cpos2str(self.word_str, context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
range(index_pair[1] + 1, range(index_pair[1] + 1,
after_index + 1)) after_index + 1))
entry_titles = tmp_session.cl_struc2str(self.entry_title_str, # entry_titles = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_title'],
struc_entry) # struc_entry)
entry_authors = tmp_session.cl_struc2str(self.entry_author_str, # entry_authors = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_author'],
struc_entry) # struc_entry)
return_dict[i] = {'tokens': tokens, tmp_dict = {'context_before': context_before,
'lemmas': lemmas, 'context_after': context_after,
'pos_tags': pos_tags, 'cpos_start': index_pair[0],
'sem_tags': sem_tags, 'cpos_end': index_pair[1]}
'context_before': context_before, match.update(tmp_dict)
'context_after': context_after, return_dict[i] = match
'entry_title': entry_titles[0],
'entry_author': entry_authors[0],
'cpos_start': index_pair[0],
'cpos_end': index_pair[1]}
tmp_session.disconnect() tmp_session.disconnect()
def get_cpos_info(self, cpos):
    """
    Look up all non-entry attributes for a span of corpus positions

    Each attribute string in self.attributes that does not contain '.entry'
    is passed to cl_cpos2str for the positions range(cpos[0], cpos[1]).

    Keyword arguments:
    cpos -- pair [start, end]; start is inclusive, end is exclusive

    Prints the collected OrderedDict (attribute string -> cl_cpos2str result)
    and also returns it, so callers can use the values.
    """
    cpos_range = range(cpos[0], cpos[1])
    match_dict = collections.OrderedDict()
    for attribute in self.attributes:
        if '.entry' in attribute:
            # Entry attributes are skipped in this lookup.
            continue
        match_dict[attribute] = self.cl_cpos2str(attribute, cpos_range)
    print(match_dict)
    return match_dict
...@@ -38,9 +38,11 @@ def recv_query(message): ...@@ -38,9 +38,11 @@ def recv_query(message):
corpus_name = 'CORPUS' corpus_name = 'CORPUS'
result_subcorpus_name = 'Query-results' # should be set by the user somehow result_subcorpus_name = 'Query-results' # should be set by the user somehow
query = message['query'] query = message['query']
analysis_client.create_attribute_strings(corpus_name) analysis_client.set_corpus_name(corpus_name)
analysis_client.query_subcorpus(corpus_name, result_subcorpus_name, query) analysis_client.create_attribute_strings()
analysis_client.show_results(corpus_name) analysis_client.query_subcorpus(result_subcorpus_name, query)
results = analysis_client.show_results()
logger.warning('Query results: {}'.format(str(results)))
def observe_corpus_analysis_connection(app, corpus_id, session_id): def observe_corpus_analysis_connection(app, corpus_id, session_id):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment