diff --git a/app/converters/sandpaper.py b/app/converters/sandpaper.py index 3172183e7dd2cd84edc46b0357b54fa9f2303c52..5b258ea210596529b8004dd03ba7600b4990bf1f 100644 --- a/app/converters/sandpaper.py +++ b/app/converters/sandpaper.py @@ -4,17 +4,18 @@ from app.models import User, Corpus, CorpusFile from datetime import datetime import json import os +import shutil def convert(json_db_file, data_dir): with open(json_db_file, 'r') as f: json_db = json.loads(f.read()) - + for json_user in json_db: if not json_user['confirmed']: current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}') continue - user_dir = os.path.join(data_dir, json_user['id']) + user_dir = os.path.join(data_dir, str(json_user['id'])) convert_user(json_user, user_dir) db.session.commit() @@ -42,7 +43,7 @@ def convert_user(json_user, user_dir): if not json_corpus['files'].values(): current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}') continue - corpus_dir = os.path.join(user_dir, 'corpora', json_corpus['id']) + corpus_dir = os.path.join(user_dir, 'corpora', str(json_corpus['id'])) convert_corpus(json_corpus, user, corpus_dir) current_app.logger.info('Done') @@ -66,12 +67,11 @@ def convert_corpus(json_corpus, user, corpus_dir): db.session.rollback() raise Exception('Internal Server Error') for json_corpus_file in json_corpus['files'].values(): - corpus_file_dir = os.path.join(corpus_dir, 'files', json_corpus_file['id']) - convert_corpus_file(json_corpus_file, corpus, corpus_file_dir) + convert_corpus_file(json_corpus_file, corpus, corpus_dir) current_app.logger.info('Done') -def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir): +def convert_corpus_file(json_corpus_file, corpus, corpus_dir): current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...') corpus_file = CorpusFile( corpus=corpus, @@ -94,122 +94,15 @@ def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir): db.session.flush(objects=[corpus_file]) 
db.session.refresh(corpus_file) try: - convert_vrt( - os.path.join(corpus_file_dir, json_corpus_file['filename']), + shutil.copy2( + os.path.join(corpus_dir, json_corpus_file['filename']), corpus_file.path ) - except OSError as e: - current_app.logger.error(e) - db.session.rollback() - raise Exception('Internal Server Error') + except: + current_app.logger.warning( + 'Can not convert corpus file: ' + f'{os.path.join(corpus_dir, json_corpus_file["filename"])}' + ' -> ' + f'{corpus_file.path}' + ) current_app.logger.info('Done') - - -def convert_vrt(input_file, output_file): - def check_pos_attribute_order(vrt_lines): - # The following orders are possible: - # since 26.02.2019: 'word,lemma,simple_pos,pos,ner' - # since 26.03.2021: 'word,pos,lemma,simple_pos,ner' - # since 27.01.2022: 'word,pos,lemma,simple_pos' - # This Function tries to find out which order we have by looking at the - # number of attributes and the position of the simple_pos attribute - SIMPLE_POS_LABELS = [ - 'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', - 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', - 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', - 'VERB', 'X' - ] - for line in vrt_lines: - if line.startswith('<'): - continue - pos_attrs = line.rstrip('\n').split('\t') - num_pos_attrs = len(pos_attrs) - if num_pos_attrs == 4: - if pos_attrs[3] in SIMPLE_POS_LABELS: - return ['word', 'pos', 'lemma', 'simple_pos'] - continue - elif num_pos_attrs == 5: - if pos_attrs[2] in SIMPLE_POS_LABELS: - return ['word', 'lemma', 'simple_pos', 'pos', 'ner'] - elif pos_attrs[3] in SIMPLE_POS_LABELS: - return ['word', 'pos', 'lemma', 'simple_pos', 'ner'] - continue - return None - - - def check_has_ent_as_s_attr(vrt_lines): - for line in vrt_lines: - if line.startswith('<ent'): - return True - return False - - - def pos_attrs_to_string_1(pos_attrs): - return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n' - - - def pos_attrs_to_string_2(pos_attrs): - return 
f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n' - - - with open(input_file) as f: - input_vrt_lines = f.readlines() - - pos_attr_order = check_pos_attribute_order(input_vrt_lines) - has_ent_as_s_attr = check_has_ent_as_s_attr(input_vrt_lines) - - print(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]') - print(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}') - - if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']: - pos_attrs_to_string_function = pos_attrs_to_string_1 - elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos', 'ner']: - pos_attrs_to_string_function = pos_attrs_to_string_2 - elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos']: - pos_attrs_to_string_function = pos_attrs_to_string_2 - else: - raise Exception('Can not handle format') - - current_ent = None - output_vrt = '' - for line in input_vrt_lines: - if line.strip() == '': - continue - if line.startswith('<'): - if not has_ent_as_s_attr: - if current_ent is not None: - output_vrt += '</ent>\n' - current_ent = None - if ( - line.startswith('<corpus') - or line.startswith('</corpus') - or line.startswith('<nlp') - ): - continue - elif line.startswith('<text'): - output_vrt += '<text>\n' - continue - elif line.startswith('<s'): - output_vrt += '<s>\n' - continue - output_vrt += line - continue - pos_attrs = line.rstrip('\n').split('\t') - if not has_ent_as_s_attr: - if pos_attrs[4].lower() in ['null', 'none']: - if current_ent: - output_vrt += '</ent>\n' - current_ent = None - else: - if current_ent is None: - output_vrt += f'<ent type="{pos_attrs[4]}">\n' - current_ent = pos_attrs[4] - elif current_ent != pos_attrs[4]: - output_vrt += '</ent>\n' - current_ent = None - output_vrt += f'<ent type="{pos_attrs[4]}">\n' - current_ent = pos_attrs[4] - output_vrt += pos_attrs_to_string_function(pos_attrs) - - with open(output_file, 'w') as f: - f.write(output_vrt) diff --git a/app/converters/vrt.py b/app/converters/vrt.py new file mode 100644 index 
from flask import current_app


# Labels used to recognise the simple_pos column of a token line.
_SIMPLE_POS_LABELS = frozenset((
    'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ',
    'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
    'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
    'VERB', 'X',
))

# Known input column layouts, each mapped to the input column indices that
# produce the normalized output order: word, pos, lemma, simple_pos.
#   since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
#   since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
#   since 27.01.2022: 'word,pos,lemma,simple_pos'
_COLUMN_ORDERS = {
    ('word', 'lemma', 'simple_pos', 'pos', 'ner'): (0, 3, 1, 2),
    ('word', 'pos', 'lemma', 'simple_pos', 'ner'): (0, 1, 2, 3),
    ('word', 'pos', 'lemma', 'simple_pos'): (0, 1, 2, 3),
}


def _detect_pos_attr_order(vrt_lines):
    """Return the column layout of the token lines, or None if unknown.

    The layout is guessed from the number of tab separated columns and from
    the position of the column that holds a simple_pos label. The first
    token line that allows a decision wins.
    """
    for line in vrt_lines:
        if line.startswith('<'):
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if len(pos_attrs) == 4:
            if pos_attrs[3] in _SIMPLE_POS_LABELS:
                return ('word', 'pos', 'lemma', 'simple_pos')
        elif len(pos_attrs) == 5:
            if pos_attrs[2] in _SIMPLE_POS_LABELS:
                return ('word', 'lemma', 'simple_pos', 'pos', 'ner')
            if pos_attrs[3] in _SIMPLE_POS_LABELS:
                return ('word', 'pos', 'lemma', 'simple_pos', 'ner')
    return None


def _has_ent_as_s_attr(vrt_lines):
    """Return True if any line starts with an ``<ent`` tag."""
    return any(line.startswith('<ent') for line in vrt_lines)


def _normalize_vrt_lines(vrt_lines, column_order, ner_column):
    """Return the normalized output lines for ``vrt_lines``.

    ``column_order`` lists the input column indices to emit per token line.
    ``ner_column`` is the index of the column holding the entity type, or
    None when ``<ent>`` tags must not be generated from the token columns.
    A token line with fewer columns than required raises IndexError.
    """
    output = []
    current_ent = None
    in_multi_line_tag = False
    for line in vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # A structural tag ends any entity generated from the ner column.
            if ner_column is not None and current_ent is not None:
                output.append('</ent>\n')
                current_ent = None
            # A tag line without a closing '>' continues on following lines.
            in_multi_line_tag = not line.rstrip().endswith('>')
            # Only text, s and ent tags are kept; text and s lose their
            # attributes, every other tag is dropped.
            if line.startswith('<text'):
                output.append('<text>\n')
            elif line.startswith('</text>'):
                output.append('</text>\n')
            elif line.startswith('<s'):
                output.append('<s>\n')
            elif line.startswith('</s>'):
                output.append('</s>\n')
            elif line.startswith(('<ent', '</ent>')):
                output.append(line)
            continue
        if in_multi_line_tag:
            # Skip every continuation line of the tag, not only the one that
            # closes it; otherwise attribute lines are parsed as tokens.
            if line.rstrip().endswith('>'):
                in_multi_line_tag = False
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if ner_column is not None:
            ent_type = pos_attrs[ner_column]
            if ent_type.lower() in ('null', 'none'):
                ent_type = None
            if ent_type != current_ent:
                if current_ent is not None:
                    output.append('</ent>\n')
                if ent_type is not None:
                    output.append(f'<ent type="{ent_type}">\n')
                current_ent = ent_type
        output.append('\t'.join(pos_attrs[i] for i in column_order) + '\n')
    # Do not leave a generated entity open if the input ends on a token line.
    if current_ent is not None:
        output.append('</ent>\n')
    return output


def normalize_vrt_file(input_file, output_file):
    """Rewrite the VRT file ``input_file`` in normalized form to ``output_file``.

    Token lines are reduced to the four tab separated columns word, pos,
    lemma and simple_pos. If the input has a ner column and contains no
    ``<ent`` tag, ``<ent type="...">`` tags are generated from that column.
    Only text, s and ent tags are kept, and text and s tags lose their
    attributes.

    :param input_file: path of the VRT file to read
    :param output_file: path the normalized VRT is written to
    :raises ValueError: if the column layout can not be detected
    :raises IndexError: if a token line has fewer columns than the layout
    :raises OSError: if a file can not be read or written
    """
    current_app.logger.info(f'Converting {input_file}...')

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = _detect_pos_attr_order(input_vrt_lines)
    if pos_attr_order is None:
        # Fail with the intended message instead of a TypeError from join().
        raise ValueError('Can not handle format')
    has_ent_as_s_attr = _has_ent_as_s_attr(input_vrt_lines)

    current_app.logger.info(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]')
    current_app.logger.info(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    # Entities are only derived from the token columns if there is a ner
    # column and the input does not already carry <ent> tags.
    if not has_ent_as_s_attr and 'ner' in pos_attr_order:
        ner_column = pos_attr_order.index('ner')
    else:
        ner_column = None

    output_vrt_lines = _normalize_vrt_lines(
        input_vrt_lines,
        _COLUMN_ORDERS[pos_attr_order],
        ner_column
    )

    with open(output_file, 'w') as f:
        f.writelines(output_vrt_lines)
528b11dc94ca1fdb47c65dc1a06a109a47a9bd61..2d1c11ca433aea8a428f465ae111d2372e711c81 100644 --- a/app/models.py +++ b/app/models.py @@ -1,3 +1,4 @@ +from app.converters.vrt import normalize_vrt_file from datetime import datetime, timedelta from enum import IntEnum from flask import current_app, url_for @@ -854,7 +855,13 @@ class Corpus(HashidMixin, db.Model): def build(self): corpus_element = ET.fromstring('<corpus>\n</corpus>') for corpus_file in self.files: - element_tree = ET.parse(corpus_file.path) + normalized_vrt_path = os.path.join(self.path, 'cwb', f'{corpus_file.id}.norm.vrt') + try: + normalize_vrt_file(corpus_file.path, normalized_vrt_path) + except: + self.status = CorpusStatus.FAILED + return + element_tree = ET.parse(normalized_vrt_path) text_element = element_tree.getroot() text_element.set('address', corpus_file.address or 'NULL') text_element.set('author', corpus_file.author)