Skip to content
Snippets Groups Projects
Commit 972f514e authored by Patrick Jentsch
Browse files

Implementation of visdata v2

parent e6d8d72e
No related branches found
No related tags found
No related merge requests found
......@@ -49,62 +49,131 @@ def cqi_corpora_corpus_update_db(cqi_client: cqi.CQiClient, corpus_name: str):
@socketio_login_required
@cqi_over_socketio
def cqi_corpora_corpus_get_visualization_data(cqi_client: cqi.CQiClient, corpus_name: str):
    """Collect the statistics used to visualize a corpus.

    The payload carries two layouts side by side:

    * flat keys (``num_tokens``, ``word_freqs``, ``texts``, ...) whose
      frequency tables are keyed by the attribute's string values, and
    * id-based keys (``corpus``, ``text``, ``s``, ``ent``) whose frequency
      tables are keyed by lexicon id, together with ``lookups`` which maps
      those ids back to their string values.

    Args:
        cqi_client: CQi client used to look up the corpus.
        corpus_name: Name of the corpus to inspect.

    Returns:
        A dict with ``code`` 200, ``msg`` ``'OK'`` and the collected
        ``payload``.
    """
    corpus = cqi_client.corpora.get(corpus_name)
    text = corpus.structural_attributes.get('text')
    s = corpus.structural_attributes.get('s')
    ent = corpus.structural_attributes.get('ent')
    ent_type = corpus.structural_attributes.get('ent_type')
    word = corpus.positional_attributes.get('word')
    lemma = corpus.positional_attributes.get('lemma')
    pos = corpus.positional_attributes.get('pos')
    simple_pos = corpus.positional_attributes.get('simple_pos')
    # Queried once up front; it does not depend on the text being processed.
    text_sub_attrs = corpus.structural_attributes.list(filters={'part_of': text})
    text_prefix_len = len(text.name) + 1

    payload = {}

    # --- Flat, value-keyed corpus statistics -------------------------------
    payload['num_tokens'] = corpus.size
    payload['num_unique_words'] = word.lexicon_size
    payload['word_freqs'] = _freqs_by_value(word)
    payload['num_unique_lemmas'] = lemma.lexicon_size
    payload['lemma_freqs'] = _freqs_by_value(lemma)
    payload['num_sentences'] = s.size
    # assuming all tokens are in a sentence
    payload['average_sentence_length'] = _safe_ratio(
        payload['num_tokens'],
        payload['num_sentences']
    )
    payload['num_ent_types'] = ent_type.size
    payload['ent_type_freqs'] = dict(
        Counter(ent_type.values_by_ids(list(range(0, ent_type.size))))
    )
    payload['num_unique_ent_types'] = len(payload['ent_type_freqs'])

    # --- Per-text statistics (both layouts share one pass) -----------------
    payload['texts'] = []
    text_lexicon = {}
    for text_id in range(0, text.size):
        text_lbound, text_rbound = text.cpos_by_id(text_id)
        text_cpos_list = list(range(text_lbound, text_rbound + 1))
        text_num_tokens = text_rbound - text_lbound + 1
        text_word_ids = word.ids_by_cpos(text_cpos_list)
        text_lemma_ids = lemma.ids_by_cpos(text_cpos_list)
        # ids_by_cpos yields one region id per corpus position; -1 is
        # filtered out as "not inside a region" (as the original code did).
        # Distinct ids are kept so the counts are regions, not tokens.
        text_s_ids = {x for x in s.ids_by_cpos(text_cpos_list) if x != -1}
        text_ent_ids = {x for x in ent.ids_by_cpos(text_cpos_list) if x != -1}
        text_ent_type_ids = [
            x for x in ent_type.ids_by_cpos(text_cpos_list) if x != -1
        ]

        text_payload = {}
        text_payload['num_tokens'] = text_num_tokens
        text_payload['num_unique_words'] = len(set(text_word_ids))
        text_payload['word_freqs'] = dict(
            Counter(word.values_by_ids(text_word_ids))
        )
        text_payload['num_unique_lemmas'] = len(set(text_lemma_ids))
        # Lemma ids must be resolved through the lemma lexicon.
        text_payload['lemma_freqs'] = dict(
            Counter(lemma.values_by_ids(text_lemma_ids))
        )
        text_payload['num_sentences'] = len(text_s_ids)
        # assuming all tokens are in a sentence
        text_payload['average_sentence_length'] = _safe_ratio(
            text_payload['num_tokens'],
            text_payload['num_sentences']
        )
        text_payload['num_ent_types'] = len(set(text_ent_type_ids))
        text_payload['ent_type_freqs'] = dict(
            Counter(ent_type.values_by_ids(text_ent_type_ids))
        )
        text_payload['num_unique_ent_types'] = len(text_payload['ent_type_freqs'])
        # Expose every sub attribute of <text> under its name without the
        # "<text attribute name>_" prefix.
        for text_sub_attr in text_sub_attrs:
            key = text_sub_attr.name[text_prefix_len:]
            text_payload[key] = text_sub_attr.values_by_ids([text_id])[0]
        payload['texts'].append(text_payload)

        text_lexicon[text_id] = {
            'bounds': [text_lbound, text_rbound],
            'counts': {
                's': len(text_s_ids),
                'ent': len(text_ent_ids),
                'token': text_num_tokens
            },
            'freqs': {
                'word': dict(Counter(text_word_ids)),
                'lemma': dict(Counter(text_lemma_ids)),
                'pos': dict(Counter(pos.ids_by_cpos(text_cpos_list))),
                'simple_pos': dict(
                    Counter(simple_pos.ids_by_cpos(text_cpos_list))
                )
            }
        }

    # --- Id-keyed layout ---------------------------------------------------
    payload['corpus'] = {'lexicon': {}, 'values': []}
    payload['corpus']['lexicon'][0] = {
        'bounds': [0, corpus.size - 1],
        'counts': {
            'text': text.size,
            's': s.size,
            'ent': ent.size,
            'token': corpus.size
        },
        'freqs': {
            'word': _freqs_by_id(word),
            'lemma': _freqs_by_id(lemma),
            'pos': _freqs_by_id(pos),
            'simple_pos': _freqs_by_id(simple_pos)
        }
    }
    payload['text'] = {
        'lexicon': text_lexicon,
        'values': [
            sub_attr.name[text_prefix_len:] for sub_attr in text_sub_attrs
        ]
    }
    # Sentence and entity entries are placeholders for now: one empty dict
    # per region id.
    payload['s'] = {
        'lexicon': {s_id: {} for s_id in range(0, s.size)},
        'values': _sub_attr_names(corpus, s)
    }
    payload['ent'] = {
        'lexicon': {ent_id: {} for ent_id in range(0, ent.size)},
        'values': _sub_attr_names(corpus, ent)
    }
    payload['lookups'] = {
        'corpus': {},
        'text': {},
        's': {},
        'ent': {},
        'word': _values_by_id(word),
        'lemma': _values_by_id(lemma),
        'pos': _values_by_id(pos),
        'simple_pos': _values_by_id(simple_pos)
    }
    return {'code': 200, 'msg': 'OK', 'payload': payload}


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is 0."""
    return numerator / denominator if denominator != 0 else 0


def _freqs_by_id(attr):
    """Map every lexicon id of a positional attribute to its frequency."""
    ids = list(range(0, attr.lexicon_size))
    return dict(zip(ids, attr.freqs_by_ids(ids)))


def _values_by_id(attr):
    """Map every lexicon id of a positional attribute to its string value."""
    ids = list(range(0, attr.lexicon_size))
    return dict(zip(ids, attr.values_by_ids(ids)))


def _freqs_by_value(attr):
    """Map every string value of a positional attribute to its frequency."""
    ids = list(range(0, attr.lexicon_size))
    return dict(zip(attr.values_by_ids(ids), attr.freqs_by_ids(ids)))


def _sub_attr_names(corpus, parent):
    """List the names of ``parent``'s sub attributes without its prefix."""
    prefix_len = len(parent.name) + 1
    return [
        sub_attr.name[prefix_len:]
        for sub_attr in corpus.structural_attributes.list(
            filters={'part_of': parent}
        )
    ]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment