Skip to content
Snippets Groups Projects
Commit 4146e378 authored by Patrick Jentsch
Browse files

normalize vrt on build

parent 99ddd2e3
No related branches found
No related tags found
No related merge requests found
......@@ -4,17 +4,18 @@ from app.models import User, Corpus, CorpusFile
from datetime import datetime
import json
import os
import shutil
def convert(json_db_file, data_dir):
    """Import every confirmed user found in a JSON database dump.

    Args:
        json_db_file: Path to a JSON file holding a list of user records.
            Each record is read for its 'confirmed', 'username' and 'id'
            keys.
        data_dir: Directory that contains one sub-directory per user,
            named after the user's id.

    Unconfirmed users are logged and skipped. Each remaining user is handed
    to ``convert_user`` and the database session is committed afterwards.
    """
    with open(json_db_file, 'r') as f:
        json_db = json.load(f)
    for json_user in json_db:
        if not json_user['confirmed']:
            current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}')
            continue
        # os.path.join() only accepts str/bytes/PathLike components, so the
        # id is stringified in case the dump stores it as a number.
        user_dir = os.path.join(data_dir, str(json_user['id']))
        convert_user(json_user, user_dir)
    db.session.commit()
......@@ -42,7 +43,7 @@ def convert_user(json_user, user_dir):
if not json_corpus['files'].values():
current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}')
continue
corpus_dir = os.path.join(user_dir, 'corpora', json_corpus['id'])
corpus_dir = os.path.join(user_dir, 'corpora', str(json_corpus['id']))
convert_corpus(json_corpus, user, corpus_dir)
current_app.logger.info('Done')
......@@ -66,12 +67,11 @@ def convert_corpus(json_corpus, user, corpus_dir):
db.session.rollback()
raise Exception('Internal Server Error')
for json_corpus_file in json_corpus['files'].values():
corpus_file_dir = os.path.join(corpus_dir, 'files', json_corpus_file['id'])
convert_corpus_file(json_corpus_file, corpus, corpus_file_dir)
convert_corpus_file(json_corpus_file, corpus, corpus_dir)
current_app.logger.info('Done')
def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir):
def convert_corpus_file(json_corpus_file, corpus, corpus_dir):
current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...')
corpus_file = CorpusFile(
corpus=corpus,
......@@ -94,122 +94,15 @@ def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir):
db.session.flush(objects=[corpus_file])
db.session.refresh(corpus_file)
try:
convert_vrt(
os.path.join(corpus_file_dir, json_corpus_file['filename']),
shutil.copy2(
os.path.join(corpus_dir, json_corpus_file['filename']),
corpus_file.path
)
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
raise Exception('Internal Server Error')
except:
current_app.logger.warning(
'Can not convert corpus file: '
f'{os.path.join(corpus_dir, json_corpus_file["filename"])}'
' -> '
f'{corpus_file.path}'
)
current_app.logger.info('Done')
def convert_vrt(input_file, output_file):
    """Rewrite a VRT file into the 'word, pos, lemma, simple_pos' layout.

    The positional attribute order of the input is detected automatically.
    If the input carries a fifth 'ner' column and no ``<ent`` tags, the
    entity information is turned into ``<ent type="...">`` / ``</ent>``
    tags wrapped around the affected tokens.

    Args:
        input_file: Path of the VRT file to read.
        output_file: Path the converted VRT is written to.

    Raises:
        Exception: With the message 'Can not handle format' when the
            attribute order cannot be detected.
        IndexError: When a token line has fewer columns than the detected
            layout requires.
        OSError: When a file cannot be read or written.
    """
    no_entity_markers = ('null', 'none')

    def check_pos_attribute_order(vrt_lines):
        # The following orders are possible:
        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
        # since 27.01.2022: 'word,pos,lemma,simple_pos'
        # This function tries to find out which order we have by looking at
        # the number of attributes and the position of the simple_pos
        # attribute.
        simple_pos_labels = frozenset([
            'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
            'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
            'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
            'VERB', 'X',
        ])
        for line in vrt_lines:
            if line.startswith('<'):
                continue
            pos_attrs = line.rstrip('\n').split('\t')
            num_pos_attrs = len(pos_attrs)
            if num_pos_attrs == 4:
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos']
            elif num_pos_attrs == 5:
                if pos_attrs[2] in simple_pos_labels:
                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
        return None

    def check_has_ent_as_s_attr(vrt_lines):
        # True as soon as any line opens an <ent ...> structural attribute.
        return any(line.startswith('<ent') for line in vrt_lines)

    def pos_attrs_to_string_1(pos_attrs):
        # Input order: word, lemma, simple_pos, pos[, ner]
        return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'

    def pos_attrs_to_string_2(pos_attrs):
        # Input order: word, pos, lemma, simple_pos[, ner]
        return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
    has_ent_as_s_attr = check_has_ent_as_s_attr(input_vrt_lines)
    # Fail with the intended message before the order is joined for output;
    # joining None would raise an unrelated TypeError instead.
    if pos_attr_order is None:
        raise Exception('Can not handle format')
    print(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
    print(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
        pos_attrs_to_string_function = pos_attrs_to_string_1
    else:
        pos_attrs_to_string_function = pos_attrs_to_string_2

    # Entity tags can only be derived when a 'ner' column exists. A
    # four-column file without any <ent> tag simply has no entities.
    derive_ents = not has_ent_as_s_attr and 'ner' in pos_attr_order

    current_ent = None
    output_chunks = []
    for line in input_vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # A structural tag always terminates a derived entity span.
            if derive_ents and current_ent is not None:
                output_chunks.append('</ent>\n')
                current_ent = None
            if line.startswith(('<corpus', '</corpus', '<nlp')):
                continue
            if line.startswith('<text'):
                # Attributes on the opening tag are dropped.
                output_chunks.append('<text>\n')
            elif line.startswith('<s'):
                output_chunks.append('<s>\n')
            else:
                output_chunks.append(line)
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if derive_ents:
            ner = pos_attrs[4]
            if ner.lower() in no_entity_markers:
                if current_ent is not None:
                    output_chunks.append('</ent>\n')
                    current_ent = None
            elif current_ent != ner:
                if current_ent is not None:
                    output_chunks.append('</ent>\n')
                output_chunks.append(f'<ent type="{ner}">\n')
                current_ent = ner
        output_chunks.append(pos_attrs_to_string_function(pos_attrs))

    with open(output_file, 'w') as f:
        f.write(''.join(output_chunks))
from flask import current_app
def normalize_vrt_file(input_file, output_file):
    """Normalize a VRT file to the 'word, pos, lemma, simple_pos' layout.

    The positional attribute order of the input is detected automatically.
    Only ``<text>``, ``<s>`` and ``<ent>`` tags (opening and closing) are
    kept; attributes on ``<text`` and ``<s`` opening tags are dropped and
    every other tag line is discarded. If the input carries a fifth 'ner'
    column and no ``<ent`` tags, ``<ent type="...">`` / ``</ent>`` tags are
    derived from that column.

    Args:
        input_file: Path of the VRT file to read.
        output_file: Path the normalized VRT is written to.

    Raises:
        Exception: With the message 'Can not handle format' when the
            attribute order cannot be detected.
        IndexError: When a token line has fewer columns than the detected
            layout requires.
        OSError: When a file cannot be read or written.
    """
    no_entity_markers = ('null', 'none')

    def check_pos_attribute_order(vrt_lines):
        # The following orders are possible:
        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
        # since 27.01.2022: 'word,pos,lemma,simple_pos'
        # This function tries to find out which order we have by looking at
        # the number of attributes and the position of the simple_pos
        # attribute.
        simple_pos_labels = frozenset([
            'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
            'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
            'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
            'VERB', 'X',
        ])
        for line in vrt_lines:
            if line.startswith('<'):
                continue
            pos_attrs = line.rstrip('\n').split('\t')
            num_pos_attrs = len(pos_attrs)
            if num_pos_attrs == 4:
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos']
            elif num_pos_attrs == 5:
                if pos_attrs[2] in simple_pos_labels:
                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
        return None

    def check_has_ent_as_s_attr(vrt_lines):
        # True as soon as any line opens an <ent ...> structural attribute.
        return any(line.startswith('<ent') for line in vrt_lines)

    def pos_attrs_to_string_1(pos_attrs):
        # Input order: word, lemma, simple_pos, pos[, ner]
        return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'

    def pos_attrs_to_string_2(pos_attrs):
        # Input order: word, pos, lemma, simple_pos[, ner]
        return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'

    current_app.logger.info(f'Converting {input_file}...')

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
    has_ent_as_s_attr = check_has_ent_as_s_attr(input_vrt_lines)
    # Fail with the intended message before the order is joined for the log;
    # joining None would raise an unrelated TypeError instead.
    if pos_attr_order is None:
        raise Exception('Can not handle format')
    current_app.logger.info(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
    current_app.logger.info(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']:
        pos_attrs_to_string_function = pos_attrs_to_string_1
    else:
        pos_attrs_to_string_function = pos_attrs_to_string_2

    # Entity tags can only be derived when a 'ner' column exists. A
    # four-column file without any <ent> tag simply has no entities.
    derive_ents = not has_ent_as_s_attr and 'ner' in pos_attr_order

    current_ent = None
    multi_line_tag_definition = False
    output_chunks = []
    for line in input_vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # A structural tag always terminates a derived entity span.
            if derive_ents and current_ent is not None:
                output_chunks.append('</ent>\n')
                current_ent = None
            # The tag is not closed on this line, so it continues below.
            if not line.rstrip().endswith('>'):
                multi_line_tag_definition = True
            if line.startswith('<text'):
                output_chunks.append('<text>\n')
            elif line.startswith('</text>'):
                output_chunks.append('</text>\n')
            elif line.startswith('<s'):
                output_chunks.append('<s>\n')
            elif line.startswith('</s>'):
                output_chunks.append('</s>\n')
            elif line.startswith(('<ent', '</ent>')):
                output_chunks.append(line)
            continue
        # NOTE(review): only the closing line of a multi-line tag is skipped
        # here; intermediate continuation lines fall through and are parsed
        # as tokens. Confirm whether tags spanning 3+ lines can occur.
        if multi_line_tag_definition and line.rstrip().endswith('>'):
            multi_line_tag_definition = False
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if derive_ents:
            ner = pos_attrs[4]
            if ner.lower() in no_entity_markers:
                if current_ent is not None:
                    output_chunks.append('</ent>\n')
                    current_ent = None
            elif current_ent != ner:
                if current_ent is not None:
                    output_chunks.append('</ent>\n')
                output_chunks.append(f'<ent type="{ner}">\n')
                current_ent = ner
        output_chunks.append(pos_attrs_to_string_function(pos_attrs))

    with open(output_file, 'w') as f:
        f.write(''.join(output_chunks))
from app.converters.vrt import normalize_vrt_file
from datetime import datetime, timedelta
from enum import IntEnum
from flask import current_app, url_for
......@@ -854,7 +855,13 @@ class Corpus(HashidMixin, db.Model):
def build(self):
corpus_element = ET.fromstring('<corpus>\n</corpus>')
for corpus_file in self.files:
element_tree = ET.parse(corpus_file.path)
normalized_vrt_path = os.path.join(self.path, 'cwb', f'{corpus_file.id}.norm.vrt')
try:
normalize_vrt_file(corpus_file.path, normalized_vrt_path)
except:
self.status = CorpusStatus.FAILED
return
element_tree = ET.parse(normalized_vrt_path)
text_element = element_tree.getroot()
text_element.set('address', corpus_file.address or 'NULL')
text_element.set('author', corpus_file.author)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment