Commit 972f514e authored by Patrick Jentsch

Implementation of visdata v2

parent e6d8d72e
@@ -49,62 +49,131 @@ def cqi_corpora_corpus_update_db(cqi_client: cqi.CQiClient, corpus_name: str):
 @socketio_login_required
 @cqi_over_socketio
 def cqi_corpora_corpus_get_visualization_data(cqi_client: cqi.CQiClient, corpus_name: str):
-    cqi_corpus = cqi_client.corpora.get(corpus_name)
+    corpus = cqi_client.corpora.get(corpus_name)
+    text = corpus.structural_attributes.get('text')
+    s = corpus.structural_attributes.get('s')
+    ent = corpus.structural_attributes.get('ent')
+    word = corpus.positional_attributes.get('word')
+    lemma = corpus.positional_attributes.get('lemma')
+    pos = corpus.positional_attributes.get('pos')
+    simple_pos = corpus.positional_attributes.get('simple_pos')
     payload = {}
-    payload['num_tokens'] = cqi_corpus.size
-    cqi_word_attr = cqi_corpus.positional_attributes.get('word')
-    payload['num_unique_words'] = cqi_word_attr.lexicon_size
-    payload['word_freqs'] = dict(zip(cqi_word_attr.values_by_ids(list(range(0, cqi_word_attr.lexicon_size))), cqi_word_attr.freqs_by_ids(list(range(0, cqi_word_attr.lexicon_size)))))
-    # payload['word_freqs'].sort(key=lambda a: a[1], reverse=True)
-    # payload['word_freqs'] = {k: v for k, v in payload['word_freqs']}
-    cqi_lemma_attr = cqi_corpus.positional_attributes.get('lemma')
-    payload['num_unique_lemmas'] = cqi_lemma_attr.lexicon_size
-    payload['lemma_freqs'] = dict(zip(cqi_lemma_attr.values_by_ids(list(range(0, cqi_lemma_attr.lexicon_size))), cqi_lemma_attr.freqs_by_ids(list(range(0, cqi_lemma_attr.lexicon_size)))))
-    # payload['lemma_freqs'].sort(key=lambda a: a[1], reverse=True)
-    # payload['lemma_freqs'] = {k: v for k, v in payload['lemma_freqs']}
-    cqi_s_attr = cqi_corpus.structural_attributes.get('s')
-    payload['num_sentences'] = cqi_s_attr.size
-    # assuming all tokens are in a sentence
-    payload['average_sentence_length'] = payload['num_tokens'] / payload['num_sentences'] if payload['num_sentences'] != 0 else 0
-    # payload['average_sentence_length'] = 0
-    # for s_id in range(0, cqi_s_attr.size):
-    #     s_lbound, s_rbound = cqi_s_attr.cpos_by_id(s_id)
-    #     payload['average_sentence_length'] += s_rbound - s_lbound + 1
-    # payload['average_sentence_length'] /= payload['num_sentences']
-    cqi_ent_type_attr = cqi_corpus.structural_attributes.get('ent_type')
-    payload['num_ent_types'] = cqi_ent_type_attr.size
-    payload['ent_type_freqs'] = dict(Counter(cqi_ent_type_attr.values_by_ids(list(range(0, cqi_ent_type_attr.size)))))
-    payload['num_unique_ent_types'] = len(payload['ent_type_freqs'])
-    payload['texts'] = []
-    cqi_text_attr = cqi_corpus.structural_attributes.get('text')
-    for text_id in range(0, cqi_text_attr.size):
-        text_lbound, text_rbound = cqi_text_attr.cpos_by_id(text_id)
-        text_cpos_list = list(range(text_lbound, text_rbound + 1))
-        text_payload = {}
-        text_payload['num_tokens'] = text_rbound - text_lbound + 1
-        text_word_ids = cqi_word_attr.ids_by_cpos(text_cpos_list)
-        print(text_word_ids)
-        text_payload['num_unique_words'] = len(set(text_word_ids))
-        text_payload['word_freqs'] = dict(Counter(cqi_word_attr.values_by_ids(text_word_ids)))
-        text_lemma_ids = cqi_lemma_attr.ids_by_cpos(text_cpos_list)
-        text_payload['num_unique_lemmas'] = len(set(text_lemma_ids))
-        text_payload['lemma_freqs'] = dict(Counter(cqi_word_attr.values_by_ids(text_lemma_ids)))
-        text_s_attr_ids = list(filter(lambda x: x != -1, cqi_s_attr.ids_by_cpos(text_cpos_list)))
-        text_payload['num_sentences'] = len(set(text_s_attr_ids))
-        # assuming all tokens are in a sentence
-        text_payload['average_sentence_length'] = text_payload['num_tokens'] / text_payload['num_sentences'] if text_payload['num_sentences'] != 0 else 0
-        # text_payload['average_sentence_length'] = 0
-        # for text_s_id in range(0, cqi_s_attr.size):
-        #     text_s_lbound, text_s_rbound = cqi_s_attr.cpos_by_id(text_s_id)
-        #     text_payload['average_sentence_length'] += text_s_rbound - text_s_lbound + 1
-        # text_payload['average_sentence_length'] /= text_payload['num_sentences']
-        text_ent_type_ids = list(filter(lambda x: x != -1, cqi_ent_type_attr.ids_by_cpos(text_cpos_list)))
-        text_payload['num_ent_types'] = len(set(text_ent_type_ids))
-        text_payload['ent_type_freqs'] = dict(Counter(cqi_ent_type_attr.values_by_ids(text_ent_type_ids)))
-        text_payload['num_unique_ent_types'] = len(text_payload['ent_type_freqs'])
-        for text_sub_attr in cqi_corpus.structural_attributes.list(filters={'part_of': cqi_text_attr}):
-            text_payload[text_sub_attr.name[(len(cqi_text_attr.name) + 1):]] = text_sub_attr.values_by_ids([text_id])[0]
-        payload['texts'].append(text_payload)
+    payload['corpus'] = {'lexicon': {}, 'values': []}
+    payload['corpus']['lexicon'][0] = {
+        'bounds': [0, corpus.size - 1],
+        'counts': {
+            'text': text.size,
+            's': s.size,
+            'ent': ent.size,
+            'token': corpus.size
+        },
+        'freqs': {
+            'word': dict(
+                zip(
+                    range(0, word.lexicon_size),
+                    word.freqs_by_ids(list(range(0, word.lexicon_size)))
+                )
+            ),
+            'lemma': dict(
+                zip(
+                    range(0, lemma.lexicon_size),
+                    lemma.freqs_by_ids(list(range(0, lemma.lexicon_size)))
+                )
+            ),
+            'pos': dict(
+                zip(
+                    range(0, pos.lexicon_size),
+                    pos.freqs_by_ids(list(range(0, pos.lexicon_size)))
+                )
+            ),
+            'simple_pos': dict(
+                zip(
+                    range(0, simple_pos.lexicon_size),
+                    simple_pos.freqs_by_ids(list(range(0, simple_pos.lexicon_size)))
+                )
+            )
+        }
+    }
+    payload['text'] = {'lexicon': {}, 'values': None}
+    for text_id in range(0, text.size):
+        text_lbound, text_rbound = text.cpos_by_id(text_id)
+        text_cpos_range = range(text_lbound, text_rbound + 1)
+        text_s_ids = s.ids_by_cpos(list(text_cpos_range))
+        text_ent_ids = ent.ids_by_cpos(list(text_cpos_range))
+        payload['text']['lexicon'][text_id] = {
+            'bounds': [text_lbound, text_rbound],
+            'counts': {
+                's': len([x for x in text_s_ids if x != -1]),
+                'ent': len([x for x in text_ent_ids if x != -1]),
+                'token': text_rbound - text_lbound + 1
+            },
+            'freqs': {
+                'word': dict(
+                    Counter(word.ids_by_cpos(list(text_cpos_range)))
+                ),
+                'lemma': dict(
+                    Counter(lemma.ids_by_cpos(list(text_cpos_range)))
+                ),
+                'pos': dict(
+                    Counter(pos.ids_by_cpos(list(text_cpos_range)))
+                ),
+                'simple_pos': dict(
+                    Counter(simple_pos.ids_by_cpos(list(text_cpos_range)))
+                )
+            }
+        }
+    payload['text']['values'] = [
+        sub_attr.name[(len(text.name) + 1):]
+        for sub_attr in corpus.structural_attributes.list(filters={'part_of': text})
+    ]
+    payload['s'] = {'lexicon': {}, 'values': None}
+    for s_id in range(0, s.size):
+        payload['s']['lexicon'][s_id] = {
+            # 'bounds': s.cpos_by_id(s_id)
+        }
+    payload['s']['values'] = [
+        sub_attr.name[(len(s.name) + 1):]
+        for sub_attr in corpus.structural_attributes.list(filters={'part_of': s})
+    ]
+    payload['ent'] = {'lexicon': {}, 'values': None}
+    for ent_id in range(0, ent.size):
+        payload['ent']['lexicon'][ent_id] = {
+            # 'bounds': ent.cpos_by_id(ent_id)
+        }
+    payload['ent']['values'] = [
+        sub_attr.name[(len(ent.name) + 1):]
+        for sub_attr in corpus.structural_attributes.list(filters={'part_of': ent})
+    ]
+    payload['lookups'] = {
+        'corpus': {},
+        'text': {},
+        's': {},
+        'ent': {},
+        'word': dict(
+            zip(
+                range(0, word.lexicon_size),
+                word.values_by_ids(list(range(0, word.lexicon_size)))
+            )
+        ),
+        'lemma': dict(
+            zip(
+                range(0, lemma.lexicon_size),
+                lemma.values_by_ids(list(range(0, lemma.lexicon_size)))
+            )
+        ),
+        'pos': dict(
+            zip(
+                range(0, pos.lexicon_size),
+                pos.values_by_ids(list(range(0, pos.lexicon_size)))
+            )
+        ),
+        'simple_pos': dict(
+            zip(
+                range(0, simple_pos.lexicon_size),
+                simple_pos.values_by_ids(list(range(0, simple_pos.lexicon_size)))
+            )
+        )
+    }
     # print(payload)
     return {'code': 200, 'msg': 'OK', 'payload': payload}
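
Note on the v2 format: every frequency table in the new payload is keyed by attribute id, and the id-to-value mappings are shipped once, corpus-wide, under 'lookups'. A minimal consumer-side sketch of how the two halves join back together (not part of this commit; get_text_word_freqs is a hypothetical helper name, assuming a payload as returned by the handler above):

def get_text_word_freqs(payload, text_id):
    # Per-text 'freqs' map word ids to counts within that text.
    id_freqs = payload['text']['lexicon'][text_id]['freqs']['word']
    # 'lookups' maps the same word ids to their surface forms.
    lookup = payload['lookups']['word']
    return {lookup[word_id]: freq for word_id, freq in id_freqs.items()}

# e.g. the ten most frequent words of text 0:
# sorted(get_text_word_freqs(payload, 0).items(), key=lambda kv: kv[1], reverse=True)[:10]

Keying by id keeps the per-text tables small and avoids repeating every word, lemma, pos, and simple_pos string once per text.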