Skip to content
Snippets Groups Projects
Commit 9d4001f4 authored by Patrick Jentsch's avatar Patrick Jentsch
Browse files

Reimplement corpus import and activate it again

parent de4a8358
No related branches found
No related tags found
No related merge requests found
# Archive member names that this module lists for an exported corpus ZIP:
# the top-level directories, the registry entry, the files below
# data/corpus/ and the merged VRT file.
check_zip_contents = [
    'data/',
    'merged/',
    'registry/',
    'registry/corpus',
    'data/corpus/',
    'data/corpus/text_editor.avs',
    'data/corpus/pos.lexicon',
    'data/corpus/simple_pos.huf',
    'data/corpus/word.huf',
    'data/corpus/text_booktitle.avs',
    'data/corpus/word.lexicon.srt',
    'data/corpus/word.lexicon.idx',
    'data/corpus/simple_pos.crx',
    'data/corpus/text_pages.rng',
    'data/corpus/simple_pos.crc',
    'data/corpus/ner.lexicon',
    'data/corpus/lemma.huf',
    'data/corpus/text_title.rng',
    'data/corpus/text_chapter.avx',
    'data/corpus/lemma.lexicon.srt',
    'data/corpus/lemma.lexicon.idx',
    'data/corpus/text_school.rng',
    'data/corpus/text_journal.avs',
    'data/corpus/simple_pos.lexicon',
    'data/corpus/pos.huf',
    'data/corpus/text_editor.avx',
    'data/corpus/lemma.crc',
    'data/corpus/lemma.lexicon',
    'data/corpus/pos.hcd',
    'data/corpus/text_title.avx',
    'data/corpus/text_institution.avs',
    'data/corpus/text_address.avx',
    'data/corpus/lemma.corpus.cnt',
    'data/corpus/word.crx',
    'data/corpus/simple_pos.hcd',
    'data/corpus/simple_pos.huf.syn',
    'data/corpus/simple_pos.lexicon.srt',
    'data/corpus/text_author.avx',
    'data/corpus/text_publisher.avs',
    'data/corpus/text_chapter.avs',
    'data/corpus/ner.corpus.cnt',
    'data/corpus/pos.huf.syn',
    'data/corpus/text_booktitle.rng',
    'data/corpus/lemma.huf.syn',
    'data/corpus/pos.corpus.cnt',
    'data/corpus/word.lexicon',
    'data/corpus/text_publishing_year.avs',
    'data/corpus/lemma.hcd',
    'data/corpus/text_school.avs',
    'data/corpus/text_journal.rng',
    'data/corpus/word.corpus.cnt',
    'data/corpus/text_school.avx',
    'data/corpus/text_journal.avx',
    'data/corpus/pos.lexicon.srt',
    'data/corpus/text_title.avs',
    'data/corpus/word.hcd',
    'data/corpus/text_chapter.rng',
    'data/corpus/text_address.rng',
    'data/corpus/ner.hcd',
    'data/corpus/text_publisher.avx',
    'data/corpus/text_institution.rng',
    'data/corpus/lemma.crx',
    'data/corpus/pos.crc',
    'data/corpus/text_author.rng',
    'data/corpus/text_address.avs',
    'data/corpus/pos.lexicon.idx',
    'data/corpus/ner.huf',
    'data/corpus/ner.huf.syn',
    'data/corpus/text_pages.avs',
    'data/corpus/text_publishing_year.avx',
    'data/corpus/ner.lexicon.idx',
    'data/corpus/text.rng',
    'data/corpus/word.crc',
    'data/corpus/ner.crc',
    'data/corpus/text_publisher.rng',
    'data/corpus/text_editor.rng',
    'data/corpus/text_author.avs',
    'data/corpus/s.rng',
    'data/corpus/text_publishing_year.rng',
    'data/corpus/simple_pos.corpus.cnt',
    'data/corpus/simple_pos.lexicon.idx',
    'data/corpus/word.huf.syn',
    'data/corpus/ner.lexicon.srt',
    'data/corpus/text_pages.avx',
    'data/corpus/text_booktitle.avx',
    'data/corpus/pos.crx',
    'data/corpus/ner.crx',
    'data/corpus/text_institution.avx',
    'merged/corpus.vrt',
]
......@@ -21,9 +21,9 @@ from .forms import (
EditCorpusFileForm,
ImportCorpusForm
)
from .import_corpus import check_zip_contents
import os
import shutil
import tempfile
import glob
import xml.etree.ElementTree as ET
......@@ -58,26 +58,10 @@ def add_corpus():
)
@bp.route('/<hashid:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
    """Send the exported archive of a corpus as a file download.

    The view is currently switched off: it aborts with 503 before doing
    any work, so the remaining code only takes effect once that call is
    removed.

    :param corpus_id: id of the corpus, taken from the URL.
    """
    # Export is disabled for now; everything below is unreachable until
    # this line is removed.
    abort(503)
    corpus = Corpus.query.get_or_404(corpus_id)
    # Only the owner or an administrator may download the archive.
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    return send_from_directory(
        as_attachment=True,
        directory=os.path.join(corpus.user.path, 'corpora'),
        filename=corpus.archive_file,
        # 'zip' alone is not a valid media type; use the registered one.
        mimetype='application/zip'
    )
@bp.route('/import', methods=['GET', 'POST'])
@login_required
def import_corpus():
abort(503)
form = ImportCorpusForm()
form = ImportCorpusForm(prefix='import-corpus-form')
if form.is_submitted():
if not form.validate():
return make_response(form.errors, 400)
......@@ -87,61 +71,71 @@ def import_corpus():
title=form.title.data
)
db.session.add(corpus)
db.session.flush()
db.session.flush(objects=[corpus])
db.session.refresh(corpus)
try:
os.makedirs(corpus.path)
corpus.makedirs()
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
flash('Internal Server Error', category='error')
return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa
# Upload zip
archive_file = os.path.join(corpus.path, form.file.data.filename)
form.file.data.save(archive_file)
# Some checks to verify it is a valid exported corpus
with ZipFile(archive_file, 'r') as zip:
contents = zip.namelist()
if set(check_zip_contents).issubset(contents):
# Unzip
shutil.unpack_archive(archive_file, corpus.path)
# Register vrt files to corpus
vrts = glob.glob(corpus.path + '/*.vrt')
for file in vrts:
element_tree = ET.parse(file)
# Save the uploaded zip file in a temporary directory
tmp_dir_base = os.path.join(current_app.config['NOPAQUE_DATA_DIR'], 'tmp') # noqa
with tempfile.TemporaryDirectory(dir=tmp_dir_base) as tmp_dir:
archive_file = os.path.join(tmp_dir, 'corpus.zip')
try:
form.archive.data.save(archive_file)
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
flash('Internal Server Error1', category='error')
return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa
shutil.unpack_archive(archive_file, extract_dir=tmp_dir)
for vrt_filename in [x for x in os.listdir(tmp_dir) if x.endswith('.vrt')]:
vrt_file = os.path.join(tmp_dir, vrt_filename)
element_tree = ET.parse(vrt_file)
text_node = element_tree.find('text')
corpus_file = CorpusFile(
address=text_node.get('address', 'NULL'),
author=text_node.get('author', 'NULL'),
booktitle=text_node.get('booktitle', 'NULL'),
chapter=text_node.get('chapter', 'NULL'),
author=text_node.get('author'),
corpus=corpus,
editor=text_node.get('editor', 'NULL'),
filename=os.path.basename(file),
institution=text_node.get('institution', 'NULL'),
journal=text_node.get('journal', 'NULL'),
pages=text_node.get('pages', 'NULL'),
publisher=text_node.get('publisher', 'NULL'),
publishing_year=text_node.get('publishing_year', ''),
school=text_node.get('school', 'NULL'),
title=text_node.get('title', 'NULL')
filename=vrt_filename,
mimetype='application/vrt+xml',
publishing_year=int(text_node.get('publishing_year')),
title=text_node.get('title')
)
# Optional bibliographic attributes: copy a value onto the corpus file
# only when the <text> element actually carries that attribute. The
# membership test must be positive; a negated test would only ever
# assign None and silently drop every imported value.
for optional_attr in (
    'address',
    'booktitle',
    'chapter',
    'editor',
    'institution',
    'journal',
    'pages',
    'publisher',
    'school'
):
    if optional_attr in text_node.attrib:
        setattr(corpus_file, optional_attr, text_node.get(optional_attr))
db.session.add(corpus_file)
# finish import and redirect to imported corpus
corpus.status = CorpusStatus.BUILT
db.session.commit()
os.remove(archive_file)
flash(f'Corpus "{corpus.title}" imported', 'corpus')
return make_response(
{'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201)
else:
# If imported zip is not valid delete corpus and give feedback
flash(
f'Can\'t import corpus "{corpus.title}": Invalid archive file',
category='error'
)
tasks.delete_corpus(corpus.id)
return make_response({'redirect_url': url_for('.import_corpus')}, 201) # noqa
db.session.flush(objects=[corpus_file])
db.session.refresh(corpus)
current_app.logger.warning(vrt_file)
current_app.logger.warning(corpus_file.path)
try:
shutil.copy2(vrt_file, corpus_file.path)
except Exception as e:
db.session.rollback()
flash('Internal Server Error2', category='error')
return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa
db.session.commit()
flash(f'Corpus "{corpus.title}" imported', 'corpus')
return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201)
return render_template(
'corpora/import_corpus.html.j2',
form=form,
......@@ -173,6 +167,26 @@ def analyse_corpus(corpus_id):
)
@bp.route('/<hashid:corpus_id>/build')
@login_required
def build_corpus(corpus_id):
    """Hand a corpus to the build task and return to its page.

    Responds with 404 for an unknown corpus and 403 when the current
    user is neither the owner nor an administrator. A corpus without
    any files is not handed over; an error message is flashed instead.

    :param corpus_id: id of the corpus, taken from the URL.
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    authorized = corpus.user == current_user or current_user.is_administrator()
    if not authorized:
        abort(403)
    if not corpus.files.all():
        # Nothing to build without at least one corpus file.
        flash(
            f'Can\'t build corpus "{corpus.title}": No corpus file(s)',
            category='error'
        )
    else:
        tasks.build_corpus(corpus_id)
        flash(
            f'Corpus "{corpus.title}" marked for building',
            category='corpus'
        )
    return redirect(url_for('.corpus', corpus_id=corpus_id))
@bp.route('/<hashid:corpus_id>/delete')
@login_required
def delete_corpus(corpus_id):
......@@ -184,6 +198,73 @@ def delete_corpus(corpus_id):
return redirect(url_for('main.dashboard'))
@bp.route('/<hashid:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
    """Send the exported archive of a corpus as a file download.

    The view is currently switched off: it aborts with 503 before doing
    any work, so the remaining code only takes effect once that call is
    removed.

    :param corpus_id: id of the corpus, taken from the URL.
    """
    # Export is disabled for now; everything below is unreachable until
    # this line is removed.
    abort(503)
    corpus = Corpus.query.get_or_404(corpus_id)
    # Only the owner or an administrator may download the archive.
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    return send_from_directory(
        as_attachment=True,
        directory=os.path.join(corpus.user.path, 'corpora'),
        filename=corpus.archive_file,
        # 'zip' alone is not a valid media type; use the registered one.
        mimetype='application/zip'
    )
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>', methods=['GET', 'POST'])  # noqa
@login_required
def corpus_file(corpus_id, corpus_file_id):
    """Show and process the edit form for one corpus file's metadata.

    Responds with 404 when no file with that id belongs to the given
    corpus and 403 when the current user is neither the corpus owner
    nor an administrator. A valid submission stores the form values,
    sets the corpus status to UNPREPARED and redirects to the corpus
    page; otherwise the form is rendered pre-filled with the stored
    values.

    :param corpus_id: id of the owning corpus, taken from the URL.
    :param corpus_file_id: id of the corpus file, taken from the URL.
    """
    # Metadata fields shared by the form and the model, in the order
    # they are copied.
    metadata_fields = (
        'address',
        'author',
        'booktitle',
        'chapter',
        'editor',
        'institution',
        'journal',
        'pages',
        'publisher',
        'publishing_year',
        'school',
        'title',
    )
    entry = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    ).first_or_404()
    may_edit = (
        entry.corpus.user == current_user
        or current_user.is_administrator()
    )
    if not may_edit:
        abort(403)
    form = EditCorpusFileForm(prefix='edit-corpus-file-form')
    if form.validate_on_submit():
        for field_name in metadata_fields:
            setattr(entry, field_name, getattr(form, field_name).data)
        entry.corpus.status = CorpusStatus.UNPREPARED
        db.session.commit()
        flash(f'Corpus file "{entry.filename}" edited', category='corpus')  # noqa
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    # If no form is submitted or valid, fill out fields with current values
    for field_name in metadata_fields:
        getattr(form, field_name).data = getattr(entry, field_name)
    return render_template(
        'corpora/corpus_file.html.j2',
        corpus=entry.corpus,
        corpus_file=entry,
        form=form,
        title='Edit corpus file'
    )
@bp.route('/<hashid:corpus_id>/files/add', methods=['GET', 'POST'])
@login_required
def add_corpus_file(corpus_id):
......@@ -271,76 +352,4 @@ def download_corpus_file(corpus_id, corpus_file_id):
attachment_filename=corpus_file.filename,
directory=os.path.dirname(corpus_file.path),
filename=os.path.basename(corpus_file.path)
)
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>', methods=['GET', 'POST'])  # noqa
@login_required
def corpus_file(corpus_id, corpus_file_id):
    """Show and process the edit form for one corpus file's metadata.

    Responds with 404 when no file with that id belongs to the given
    corpus and 403 when the current user is neither the corpus owner
    nor an administrator. A valid submission stores the form values,
    sets the corpus status to UNPREPARED and redirects to the corpus
    page; otherwise the form is rendered pre-filled with the stored
    values.

    :param corpus_id: id of the owning corpus, taken from the URL.
    :param corpus_file_id: id of the corpus file, taken from the URL.
    """
    # Metadata fields shared by the form and the model, in the order
    # they are copied.
    metadata_fields = (
        'address',
        'author',
        'booktitle',
        'chapter',
        'editor',
        'institution',
        'journal',
        'pages',
        'publisher',
        'publishing_year',
        'school',
        'title',
    )
    entry = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    ).first_or_404()
    may_edit = (
        entry.corpus.user == current_user
        or current_user.is_administrator()
    )
    if not may_edit:
        abort(403)
    form = EditCorpusFileForm(prefix='edit-corpus-file-form')
    if form.validate_on_submit():
        for field_name in metadata_fields:
            setattr(entry, field_name, getattr(form, field_name).data)
        entry.corpus.status = CorpusStatus.UNPREPARED
        db.session.commit()
        flash(f'Corpus file "{entry.filename}" edited', category='corpus')  # noqa
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    # If no form is submitted or valid, fill out fields with current values
    for field_name in metadata_fields:
        getattr(form, field_name).data = getattr(entry, field_name)
    return render_template(
        'corpora/corpus_file.html.j2',
        corpus=entry.corpus,
        corpus_file=entry,
        form=form,
        title='Edit corpus file'
    )
@bp.route('/<hashid:corpus_id>/build')
@login_required
def build_corpus(corpus_id):
    """Hand a corpus to the build task and return to its page.

    Responds with 404 for an unknown corpus and 403 when the current
    user is neither the owner nor an administrator. A corpus without
    any files is not handed over; an error message is flashed instead.

    :param corpus_id: id of the corpus, taken from the URL.
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    authorized = corpus.user == current_user or current_user.is_administrator()
    if not authorized:
        abort(403)
    if not corpus.files.all():
        # Nothing to build without at least one corpus file.
        flash(
            f'Can\'t build corpus "{corpus.title}": No corpus file(s)',
            category='error'
        )
    else:
        tasks.build_corpus(corpus_id)
        flash(
            f'Corpus "{corpus.title}" marked for building',
            category='corpus'
        )
    return redirect(url_for('.corpus', corpus_id=corpus_id))
)
\ No newline at end of file
......@@ -31,7 +31,7 @@
</div>
<div class="row">
<div class="col s12">
{{ wtf.render_field(form.file, accept='.zip', placeholder='Choose your exported .zip file') }}
{{ wtf.render_field(form.archive, accept='.zip', placeholder='Choose an exported ZIP archive') }}
</div>
</div>
</div>
......
......@@ -40,7 +40,7 @@
<ul class="pagination"></ul>
</div>
<div class="card-action right-align">
<a class="btn disabled waves-effect waves-light" href="{{ url_for('corpora.import_corpus') }}"><i class="material-icons right">import_export</i>Import Corpus</a>
<a class="btn waves-effect waves-light" href="{{ url_for('corpora.import_corpus') }}"><i class="material-icons right">import_export</i>Import Corpus</a>
<a class="btn waves-effect waves-light" href="{{ url_for('corpora.add_corpus') }}">New corpus<i class="material-icons right">add</i></a>
</div>
</div>
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment