Skip to content
Snippets Groups Projects
Commit 688b96ff authored by Patrick Jentsch
Browse files

remove debug messages and increase chunk size in cqi

parent a9973e9c
No related branches found
No related tags found
No related merge requests found
......@@ -12,7 +12,6 @@ import gzip
import json
import math
import os
import shutil
from app import db
from app.models import Corpus
from .utils import lookups_by_cpos, partial_export_subcorpus, export_subcorpus
......@@ -67,12 +66,12 @@ def ext_corpus_static_data(corpus: str) -> Dict:
}
for p_attr in cqi_p_attrs.values():
static_corpus_data['corpus']['freqs'][p_attr.name] = {}
chunk_size = 5000
chunk_size = 10000
p_attr_id_list = list(range(p_attr.lexicon_size))
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
del p_attr_id_list
for chunk in chunks:
print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
# print(f'corpus.freqs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
static_corpus_data['corpus']['freqs'][p_attr.name].update(
dict(zip(chunk, p_attr.freqs_by_ids(chunk)))
)
......@@ -82,7 +81,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
chunks = [cpos_list[i:i+chunk_size] for i in range(0, len(cpos_list), chunk_size)]
del cpos_list
for chunk in chunks:
print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
# print(f'p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
static_corpus_data['p_attrs'][p_attr.name].update(
dict(zip(chunk, p_attr.ids_by_cpos(chunk)))
)
......@@ -92,7 +91,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
chunks = [p_attr_id_list[i:i+chunk_size] for i in range(0, len(p_attr_id_list), chunk_size)]
del p_attr_id_list
for chunk in chunks:
print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
# print(f'values.p_attrs.{p_attr.name}: {chunk[0]} - {chunk[-1]}')
static_corpus_data['values']['p_attrs'][p_attr.name].update(
dict(zip(chunk, p_attr.values_by_ids(chunk)))
)
......@@ -123,7 +122,7 @@ def ext_corpus_static_data(corpus: str) -> Dict:
# static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
# cqi_subcorpus.drop()
for id in range(0, s_attr.size):
print(f's_attrs.{s_attr.name}.lexicon.{id}')
# print(f's_attrs.{s_attr.name}.lexicon.{id}')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {
'bounds': None,
'counts': None,
......@@ -132,9 +131,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
if s_attr.name != 'text':
continue
lbound, rbound = s_attr.cpos_by_id(id)
print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
# print(f's_attrs.{s_attr.name}.lexicon.{id}.bounds')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
# print(f's_attrs.{s_attr.name}.lexicon.{id}.counts')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
cpos_list = list(range(lbound, rbound + 1))
......@@ -142,22 +141,22 @@ def ext_corpus_static_data(corpus: str) -> Dict:
del cpos_list
ent_ids = set()
for chunk in chunks:
print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
# print(f'Gather ent_ids from cpos: {chunk[0]} - {chunk[-1]}')
ent_ids.update({x for x in cqi_s_attrs['ent'].ids_by_cpos(chunk) if x != -1})
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['ent'] = len(ent_ids)
del ent_ids
s_ids = set()
for chunk in chunks:
print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
# print(f'Gather s_ids from cpos: {chunk[0]} - {chunk[-1]}')
s_ids.update({x for x in cqi_s_attrs['s'].ids_by_cpos(chunk) if x != -1})
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['s'] = len(s_ids)
del s_ids
print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
# print(f's_attrs.{s_attr.name}.lexicon.{id}.freqs')
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'] = {}
for p_attr in cqi_p_attrs.values():
p_attr_ids = []
for chunk in chunks:
print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
# print(f'Gather p_attr_ids from cpos: {chunk[0]} - {chunk[-1]}')
p_attr_ids.extend(p_attr.ids_by_cpos(chunk))
static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['freqs'][p_attr.name] = dict(Counter(p_attr_ids))
del p_attr_ids
......@@ -178,9 +177,9 @@ def ext_corpus_static_data(corpus: str) -> Dict:
sub_s_attr_values.append(tmp)
del tmp
del chunks
print(f's_attrs.{s_attr.name}.values')
# print(f's_attrs.{s_attr.name}.values')
static_corpus_data['s_attrs'][s_attr.name]['values'] = s_attr_value_names
print(f'values.s_attrs.{s_attr.name}')
# print(f'values.s_attrs.{s_attr.name}')
static_corpus_data['values']['s_attrs'][s_attr.name] = {
s_attr_id: {
s_attr_value_name: sub_s_attr_values[s_attr_value_name_idx][s_attr_id_idx]
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment