Skip to content
Snippets Groups Projects
Commit ffed8592 authored by Stephan Porada's avatar Stephan Porada :speech_balloon:
Browse files

Add new CQiWrapper

parent 0392d254
No related branches found
No related tags found
No related merge requests found
from .CQiClient import CQiClient
import multiprocessing
import collections
import socket
class CQiWrapper(CQiClient):
......@@ -33,21 +32,41 @@ class CQiWrapper(CQiClient):
"""
self.ctrl_connect(self.username, self.password)
def create_attribute_strings(self, corpus_name):
    """
    Build the fully qualified attribute names for one corpus.

    Every name has the form '<corpus_name>.<attribute>'. The individual
    names are stored as self.word_str, self.lemma_str, self.pos_str,
    self.sem_str, self.entry_str, self.entry_author_str and
    self.entry_title_str; the same seven names, in that order, are also
    collected in self.attributes.

    Keyword arguments:
    corpus_name -- name of the corpus the attribute names are built for
    """
    # The five plain attributes only differ in their suffix.
    for suffix in ('word', 'lemma', 'pos', 'sem', 'entry'):
        setattr(self, suffix + '_str', corpus_name + '.' + suffix)
    # Author and title are derived from the entry attribute name.
    self.entry_author_str = self.entry_str + '_author'
    self.entry_title_str = self.entry_str + '_title'
    self.attributes = [
        self.word_str,
        self.lemma_str,
        self.pos_str,
        self.sem_str,
        self.entry_str,
        self.entry_author_str,
        self.entry_title_str,
    ]
def create_attribute_strings(self):
    """
    Look up the attributes of self.corpus_name and build their full names.

    The positional and structural attribute names are requested with
    corpus_positional_attributes() and corpus_structural_attributes().
    Each name is mapped to its qualified form '<corpus_name>.<attribute>'
    and stored in self.attr_strings:

        {'positional_attrs': {'word': 'CORPUS.word', ...},
         'struct_attrs': {'entry': 'CORPUS.entry', ...}}

    self.meta_struct_element is set to the first structural attribute
    (get_cpos_info() uses it to resolve corpus positions to structure
    ids). It is None if the corpus reports no structural attributes.

    self.corpus_name has to be set before calling this method.
    """
    corpus_prefix = self.corpus_name + '.'
    p_attrs = self.corpus_positional_attributes(self.corpus_name)
    struct_attrs = self.corpus_structural_attributes(self.corpus_name)
    # Guard against corpora without structural attributes instead of
    # failing with an IndexError.
    self.meta_struct_element = struct_attrs[0] if struct_attrs else None
    self.attr_strings = {
        'positional_attrs': {p_attr: corpus_prefix + p_attr
                             for p_attr in p_attrs},
        # NOTE(review): the last structural attribute is deliberately left
        # out here, exactly as before -- presumably it carries no values
        # that could be read with cl_struc2str; confirm against the corpus
        # encoding.
        'struct_attrs': {struct_attr: corpus_prefix + struct_attr
                         for struct_attr in struct_attrs[:-1]},
    }
def set_corpus_name(self, corpus_name):
    """
    Store the name of the corpus this wrapper works on.

    Keyword arguments:
    corpus_name -- corpus name, kept as self.corpus_name
    """
    self.corpus_name = corpus_name
def disconnect(self):
"""
......@@ -58,7 +77,7 @@ class CQiWrapper(CQiClient):
self.ctrl_bye()
self.connection.close()
def query_subcorpus(self, corpus_name, result_subcorpus_name, query):
def query_subcorpus(self, result_subcorpus_name, query):
"""
Create subcorpus
......@@ -66,13 +85,12 @@ class CQiWrapper(CQiClient):
positions for that query.
Keyword arguments:
corpus_name -- name of the corpus the query will be used on
result_subcorpus_name -- user set name of the subcorpus which holds all
cpos match positions, produced by the query
query -- query written in cqp query language
"""
self.cqp_query(corpus_name, result_subcorpus_name, query)
self.result_subcorpus_ns = (corpus_name
self.cqp_query(self.corpus_name, result_subcorpus_name, query)
self.result_subcorpus_ns = (self.corpus_name
+ ':'
+ result_subcorpus_name)
self.SUBCORPUS_NAMES.append(self.result_subcorpus_ns)
......@@ -80,11 +98,9 @@ class CQiWrapper(CQiClient):
print('Nr of all matches is:', self.nr_matches)
def show_subcorpora(self):
    """
    Print and return the subcorpus names recorded in self.SUBCORPUS_NAMES.
    """
    print('Known subcorpora:', self.SUBCORPUS_NAMES)
    # NOTE(review): an unreachable second statement,
    # `return self.cqp_list_subcorpora(self.corpus_name)`, used to follow
    # this return and has been removed as dead code. It looks like the
    # intended replacement for the locally kept list -- confirm which of
    # the two results callers should get.
    return self.SUBCORPUS_NAMES
def show_results(self,
corpus_name,
result_start_count=0,
result_max_count=50,
context_len=10,):
......@@ -116,7 +132,6 @@ class CQiWrapper(CQiClient):
])
Keyword arguments:
corpus_name -- name of the parent corpus the subcorpus is part of
result_start_count -- start position of the dumped subcorpus.
(default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
matches 50 to 100 will be shown.
......@@ -126,8 +141,7 @@ class CQiWrapper(CQiClient):
shown (default 10)
"""
self.context_len = context_len
word_str = corpus_name + '.word'
self.corpus_max_len = self.cl_attribute_size(word_str)
self.corpus_max_len = self.cl_attribute_size(self.attr_strings['positional_attrs']['word'])
if self.nr_matches == 0:
print('Query resulted in 0 matches.')
else:
......@@ -157,7 +171,7 @@ class CQiWrapper(CQiClient):
match = multiprocessing.Process(target=self.__get_matches,
args=(i,
index_pair,
corpus_name,
self.corpus_name,
return_dict))
matches.append(match)
match.start()
......@@ -167,7 +181,25 @@ class CQiWrapper(CQiClient):
ordered_results = collections.OrderedDict()
for key in sorted(return_dict.keys()):
ordered_results[key] = return_dict[key]
print('ORDERED_RESULTS', ordered_results)
return ordered_results
def get_cpos_info(self, cpos, session):
    """
    Collect all known attribute values for a span of corpus positions.

    Keyword arguments:
    cpos -- pair of corpus positions; the values are read for
    range(cpos[0], cpos[1]), so cpos[1] itself is not included
    session -- connected client object used for the requests; it has to
    provide cl_cpos2str, cl_cpos2struc and cl_struc2str

    Returns a dictionary with one entry per attribute name found in
    self.attr_strings. Positional attributes map to the result of
    cl_cpos2str, structural attributes map to the set of values returned
    by cl_struc2str.
    """
    positions = range(cpos[0], cpos[1])
    match_dict = {}
    positional_attrs = self.attr_strings.get('positional_attrs', {})
    for p_attr_key, p_attr_str in positional_attrs.items():
        match_dict[p_attr_key] = session.cl_cpos2str(p_attr_str, positions)
    struct_attrs = self.attr_strings.get('struct_attrs', {})
    if struct_attrs:
        # The structure ids only depend on the positions and the meta
        # structural element, so they are requested once instead of once
        # per structural attribute.
        struct_entry = session.cl_cpos2struc(
            struct_attrs[self.meta_struct_element], positions)
        for struct_attr_key, struct_attr_str in struct_attrs.items():
            match_str = session.cl_struc2str(struct_attr_str, struct_entry)
            match_dict[struct_attr_key] = set(match_str)
    return match_dict
def __get_matches(self, i, index_pair, corpus_name, return_dict):
"""
......@@ -183,58 +215,46 @@ class CQiWrapper(CQiClient):
return_dict -- dictionary created with manager.dict() that holds the
extracted strings tags etc.
"""
print('START:', index_pair[0])
print('END:', index_pair[1])
print('=============================')
# print('START:', index_pair[0])
# print('END:', index_pair[1])
# print('=============================')
index_pair = [index_pair[0], index_pair[1] + 1]
tmp_session = CQiWrapper(username=self.username, password=self.password,
host=self.host, port=self.port)
tmp_session.connect()
tokens = tmp_session.cl_cpos2str(self.word_str,
range(index_pair[0],
index_pair[1] + 1))
lemmas = tmp_session.cl_cpos2str(self.lemma_str,
range(index_pair[0],
index_pair[1] + 1))
pos_tags = tmp_session.cl_cpos2str(self.pos_str,
range(index_pair[0],
index_pair[1] + 1))
sem_tags = tmp_session.cl_cpos2str(self.sem_str,
range(index_pair[0],
index_pair[1] + 1))
struc_entry = tmp_session.cl_cpos2struc(self.entry_str,
range(index_pair[0],
index_pair[1] + 1))
match = self.get_cpos_info(index_pair, tmp_session)
# tokens = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
# range(index_pair[0],
# index_pair[1] + 1))
# lemmas = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['lemma'],
# range(index_pair[0],
# index_pair[1] + 1))
# pos_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['pos'],
# range(index_pair[0],
# index_pair[1] + 1))
# sem_tags = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['sem'],
# range(index_pair[0],
# index_pair[1] + 1))
# struc_entry = tmp_session.cl_cpos2struc(self.attr_strings['struct_attrs']['entry'],
# range(index_pair[0],
# index_pair[1] + 1))
before_index = max([0, index_pair[0] - self.context_len])
after_index = min([self.corpus_max_len,
index_pair[1] + self.context_len])
context_before = tmp_session.cl_cpos2str(self.word_str,
context_before = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
range(before_index,
index_pair[0]))
context_after = tmp_session.cl_cpos2str(self.word_str,
context_after = tmp_session.cl_cpos2str(self.attr_strings['positional_attrs']['word'],
range(index_pair[1] + 1,
after_index + 1))
entry_titles = tmp_session.cl_struc2str(self.entry_title_str,
struc_entry)
entry_authors = tmp_session.cl_struc2str(self.entry_author_str,
struc_entry)
return_dict[i] = {'tokens': tokens,
'lemmas': lemmas,
'pos_tags': pos_tags,
'sem_tags': sem_tags,
'context_before': context_before,
'context_after': context_after,
'entry_title': entry_titles[0],
'entry_author': entry_authors[0],
'cpos_start': index_pair[0],
'cpos_end': index_pair[1]}
# entry_titles = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_title'],
# struc_entry)
# entry_authors = tmp_session.cl_struc2str(self.attr_strings['struct_attrs']['entry_author'],
# struc_entry)
tmp_dict = {'context_before': context_before,
'context_after': context_after,
'cpos_start': index_pair[0],
'cpos_end': index_pair[1]}
match.update(tmp_dict)
return_dict[i] = match
tmp_session.disconnect()
def get_cpos_info(self, cpos):
    """
    Print the values of all non-entry attributes for a span of positions.

    Every name in self.attributes that does not contain '.entry' is
    resolved with cl_cpos2str for range(cpos[0], cpos[1]). The results are
    gathered in an OrderedDict keyed by attribute name, which is printed;
    nothing is returned.

    Keyword arguments:
    cpos -- pair of corpus positions; cpos[1] itself is not included
    """
    match_dict = collections.OrderedDict(
        (attribute, self.cl_cpos2str(attribute, range(cpos[0], cpos[1])))
        for attribute in self.attributes
        if '.entry' not in attribute
    )
    print(match_dict)
......@@ -38,9 +38,11 @@ def recv_query(message):
corpus_name = 'CORPUS'
result_subcorpus_name = 'Query-results' # should be set by the user somehow
query = message['query']
analysis_client.create_attribute_strings(corpus_name)
analysis_client.query_subcorpus(corpus_name, result_subcorpus_name, query)
analysis_client.show_results(corpus_name)
analysis_client.set_corpus_name(corpus_name)
analysis_client.create_attribute_strings()
analysis_client.query_subcorpus(result_subcorpus_name, query)
results = analysis_client.show_results()
logger.warning('Query results: {}'.format(str(results)))
def observe_corpus_analysis_connection(app, corpus_id, session_id):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment