Skip to content
Snippets Groups Projects
Commit 9d4001f4 authored by Patrick Jentsch's avatar Patrick Jentsch
Browse files

Reimplement corpus import and activate it again

parent de4a8358
No related branches found
No related tags found
No related merge requests found
# Archive member names that the import view compares against
# ZipFile.namelist() (via set.issubset) before accepting an uploaded archive.
# Entries ending in '/' are directory members.
check_zip_contents = [
    'data/',
    'merged/',
    'registry/',
    'registry/corpus',
    'data/corpus/',
    'data/corpus/text_editor.avs',
    'data/corpus/pos.lexicon',
    'data/corpus/simple_pos.huf',
    'data/corpus/word.huf',
    'data/corpus/text_booktitle.avs',
    'data/corpus/word.lexicon.srt',
    'data/corpus/word.lexicon.idx',
    'data/corpus/simple_pos.crx',
    'data/corpus/text_pages.rng',
    'data/corpus/simple_pos.crc',
    'data/corpus/ner.lexicon',
    'data/corpus/lemma.huf',
    'data/corpus/text_title.rng',
    'data/corpus/text_chapter.avx',
    'data/corpus/lemma.lexicon.srt',
    'data/corpus/lemma.lexicon.idx',
    'data/corpus/text_school.rng',
    'data/corpus/text_journal.avs',
    'data/corpus/simple_pos.lexicon',
    'data/corpus/pos.huf',
    'data/corpus/text_editor.avx',
    'data/corpus/lemma.crc',
    'data/corpus/lemma.lexicon',
    'data/corpus/pos.hcd',
    'data/corpus/text_title.avx',
    'data/corpus/text_institution.avs',
    'data/corpus/text_address.avx',
    'data/corpus/lemma.corpus.cnt',
    'data/corpus/word.crx',
    'data/corpus/simple_pos.hcd',
    'data/corpus/simple_pos.huf.syn',
    'data/corpus/simple_pos.lexicon.srt',
    'data/corpus/text_author.avx',
    'data/corpus/text_publisher.avs',
    'data/corpus/text_chapter.avs',
    'data/corpus/ner.corpus.cnt',
    'data/corpus/pos.huf.syn',
    'data/corpus/text_booktitle.rng',
    'data/corpus/lemma.huf.syn',
    'data/corpus/pos.corpus.cnt',
    'data/corpus/word.lexicon',
    'data/corpus/text_publishing_year.avs',
    'data/corpus/lemma.hcd',
    'data/corpus/text_school.avs',
    'data/corpus/text_journal.rng',
    'data/corpus/word.corpus.cnt',
    'data/corpus/text_school.avx',
    'data/corpus/text_journal.avx',
    'data/corpus/pos.lexicon.srt',
    'data/corpus/text_title.avs',
    'data/corpus/word.hcd',
    'data/corpus/text_chapter.rng',
    'data/corpus/text_address.rng',
    'data/corpus/ner.hcd',
    'data/corpus/text_publisher.avx',
    'data/corpus/text_institution.rng',
    'data/corpus/lemma.crx',
    'data/corpus/pos.crc',
    'data/corpus/text_author.rng',
    'data/corpus/text_address.avs',
    'data/corpus/pos.lexicon.idx',
    'data/corpus/ner.huf',
    'data/corpus/ner.huf.syn',
    'data/corpus/text_pages.avs',
    'data/corpus/text_publishing_year.avx',
    'data/corpus/ner.lexicon.idx',
    'data/corpus/text.rng',
    'data/corpus/word.crc',
    'data/corpus/ner.crc',
    'data/corpus/text_publisher.rng',
    'data/corpus/text_editor.rng',
    'data/corpus/text_author.avs',
    'data/corpus/s.rng',
    'data/corpus/text_publishing_year.rng',
    'data/corpus/simple_pos.corpus.cnt',
    'data/corpus/simple_pos.lexicon.idx',
    'data/corpus/word.huf.syn',
    'data/corpus/ner.lexicon.srt',
    'data/corpus/text_pages.avx',
    'data/corpus/text_booktitle.avx',
    'data/corpus/pos.crx',
    'data/corpus/ner.crx',
    'data/corpus/text_institution.avx',
    'merged/corpus.vrt',
]
...@@ -21,9 +21,9 @@ from .forms import ( ...@@ -21,9 +21,9 @@ from .forms import (
EditCorpusFileForm, EditCorpusFileForm,
ImportCorpusForm ImportCorpusForm
) )
from .import_corpus import check_zip_contents
import os import os
import shutil import shutil
import tempfile
import glob import glob
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
...@@ -58,26 +58,10 @@ def add_corpus(): ...@@ -58,26 +58,10 @@ def add_corpus():
) )
@bp.route('/<hashid:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
    """Send a corpus archive as a file download (currently disabled).

    The view aborts with 503 as its first statement, so everything after
    that call is unreachable until the guard is removed. The remaining code
    looks up the corpus (404 if missing), rejects requesters who are neither
    the owner nor an administrator (403) and sends ``corpus.archive_file``
    from the owner's ``corpora`` directory as an attachment.
    """
    # Export is switched off: abort() raises, nothing below runs.
    abort(503)
    corpus = Corpus.query.get_or_404(corpus_id)
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    return send_from_directory(
        as_attachment=True,
        directory=os.path.join(corpus.user.path, 'corpora'),
        filename=corpus.archive_file,
        # A bare 'zip' is not a valid media type; use the registered one.
        mimetype='application/zip'
    )
@bp.route('/import', methods=['GET', 'POST']) @bp.route('/import', methods=['GET', 'POST'])
@login_required @login_required
def import_corpus(): def import_corpus():
abort(503) form = ImportCorpusForm(prefix='import-corpus-form')
form = ImportCorpusForm()
if form.is_submitted(): if form.is_submitted():
if not form.validate(): if not form.validate():
return make_response(form.errors, 400) return make_response(form.errors, 400)
...@@ -87,61 +71,71 @@ def import_corpus(): ...@@ -87,61 +71,71 @@ def import_corpus():
title=form.title.data title=form.title.data
) )
db.session.add(corpus) db.session.add(corpus)
db.session.flush() db.session.flush(objects=[corpus])
db.session.refresh(corpus) db.session.refresh(corpus)
try: try:
os.makedirs(corpus.path) corpus.makedirs()
except OSError as e: except OSError as e:
current_app.logger.error(e) current_app.logger.error(e)
db.session.rollback() db.session.rollback()
flash('Internal Server Error', category='error') flash('Internal Server Error', category='error')
return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa
# Upload zip # Save the uploaded zip file in a temporary directory
archive_file = os.path.join(corpus.path, form.file.data.filename) tmp_dir_base = os.path.join(current_app.config['NOPAQUE_DATA_DIR'], 'tmp') # noqa
form.file.data.save(archive_file) with tempfile.TemporaryDirectory(dir=tmp_dir_base) as tmp_dir:
# Some checks to verify it is a valid exported corpus archive_file = os.path.join(tmp_dir, 'corpus.zip')
with ZipFile(archive_file, 'r') as zip: try:
contents = zip.namelist() form.archive.data.save(archive_file)
if set(check_zip_contents).issubset(contents): except OSError as e:
# Unzip current_app.logger.error(e)
shutil.unpack_archive(archive_file, corpus.path) db.session.rollback()
# Register vrt files to corpus flash('Internal Server Error1', category='error')
vrts = glob.glob(corpus.path + '/*.vrt') return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa
for file in vrts: shutil.unpack_archive(archive_file, extract_dir=tmp_dir)
element_tree = ET.parse(file) for vrt_filename in [x for x in os.listdir(tmp_dir) if x.endswith('.vrt')]:
vrt_file = os.path.join(tmp_dir, vrt_filename)
element_tree = ET.parse(vrt_file)
text_node = element_tree.find('text') text_node = element_tree.find('text')
corpus_file = CorpusFile( corpus_file = CorpusFile(
address=text_node.get('address', 'NULL'), author=text_node.get('author'),
author=text_node.get('author', 'NULL'),
booktitle=text_node.get('booktitle', 'NULL'),
chapter=text_node.get('chapter', 'NULL'),
corpus=corpus, corpus=corpus,
editor=text_node.get('editor', 'NULL'), filename=vrt_filename,
filename=os.path.basename(file), mimetype='application/vrt+xml',
institution=text_node.get('institution', 'NULL'), publishing_year=int(text_node.get('publishing_year')),
journal=text_node.get('journal', 'NULL'), title=text_node.get('title')
pages=text_node.get('pages', 'NULL'),
publisher=text_node.get('publisher', 'NULL'),
publishing_year=text_node.get('publishing_year', ''),
school=text_node.get('school', 'NULL'),
title=text_node.get('title', 'NULL')
) )
if 'address' not in text_node.attrib:
corpus_file.address = text_node.get('address')
if 'booktitle' not in text_node.attrib:
corpus_file.booktitle = text_node.get('booktitle')
if 'chapter' not in text_node.attrib:
corpus_file.chapter = text_node.get('chapter')
if 'editor' not in text_node.attrib:
corpus_file.editor = text_node.get('editor')
if 'institution' not in text_node.attrib:
corpus_file.institution = text_node.get('institution')
if 'journal' not in text_node.attrib:
corpus_file.journal = text_node.get('journal')
if 'pages' not in text_node.attrib:
corpus_file.pages = text_node.get('pages')
if 'publisher' not in text_node.attrib:
corpus_file.publisher = text_node.get('publisher')
if 'school' not in text_node.attrib:
corpus_file.school = text_node.get('school')
db.session.add(corpus_file) db.session.add(corpus_file)
# finish import and redirect to imported corpus db.session.flush(objects=[corpus_file])
corpus.status = CorpusStatus.BUILT db.session.refresh(corpus)
db.session.commit() current_app.logger.warning(vrt_file)
os.remove(archive_file) current_app.logger.warning(corpus_file.path)
flash(f'Corpus "{corpus.title}" imported', 'corpus') try:
return make_response( shutil.copy2(vrt_file, corpus_file.path)
{'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201) except Exception as e:
else: db.session.rollback()
# If imported zip is not valid delete corpus and give feedback flash('Internal Server Error2', category='error')
flash( return make_response({'redirect_url': url_for('.import_corpus')}, 500) # noqa
f'Can\'t import corpus "{corpus.title}": Invalid archive file', db.session.commit()
category='error' flash(f'Corpus "{corpus.title}" imported', 'corpus')
) return make_response({'redirect_url': url_for('.corpus', corpus_id=corpus.id)}, 201)
tasks.delete_corpus(corpus.id)
return make_response({'redirect_url': url_for('.import_corpus')}, 201) # noqa
return render_template( return render_template(
'corpora/import_corpus.html.j2', 'corpora/import_corpus.html.j2',
form=form, form=form,
...@@ -173,6 +167,26 @@ def analyse_corpus(corpus_id): ...@@ -173,6 +167,26 @@ def analyse_corpus(corpus_id):
) )
@bp.route('/<hashid:corpus_id>/build')
@login_required
def build_corpus(corpus_id):
    """Hand the corpus to ``tasks.build_corpus`` and redirect to its page.

    Responds with 404 when the corpus does not exist and with 403 when the
    requester is neither its owner nor an administrator. A corpus without
    files is not handed over; an error message is flashed instead. Either
    way the client is redirected to the corpus view.
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    if not corpus.files.all():
        # Nothing to build from: report and fall through to the redirect.
        flash(
            f'Can\'t build corpus "{corpus.title}": No corpus file(s)',
            category='error'
        )
    else:
        tasks.build_corpus(corpus_id)
        flash(
            f'Corpus "{corpus.title}" marked for building',
            category='corpus'
        )
    return redirect(url_for('.corpus', corpus_id=corpus_id))
@bp.route('/<hashid:corpus_id>/delete') @bp.route('/<hashid:corpus_id>/delete')
@login_required @login_required
def delete_corpus(corpus_id): def delete_corpus(corpus_id):
...@@ -184,6 +198,73 @@ def delete_corpus(corpus_id): ...@@ -184,6 +198,73 @@ def delete_corpus(corpus_id):
return redirect(url_for('main.dashboard')) return redirect(url_for('main.dashboard'))
@bp.route('/<hashid:corpus_id>/export')
@login_required
def export_corpus(corpus_id):
    """Send a corpus archive as a file download (currently disabled).

    The view aborts with 503 as its first statement, so everything after
    that call is unreachable until the guard is removed. The remaining code
    looks up the corpus (404 if missing), rejects requesters who are neither
    the owner nor an administrator (403) and sends ``corpus.archive_file``
    from the owner's ``corpora`` directory as an attachment.
    """
    # Export is switched off: abort() raises, nothing below runs.
    abort(503)
    corpus = Corpus.query.get_or_404(corpus_id)
    if not (corpus.user == current_user or current_user.is_administrator()):
        abort(403)
    return send_from_directory(
        as_attachment=True,
        directory=os.path.join(corpus.user.path, 'corpora'),
        filename=corpus.archive_file,
        # A bare 'zip' is not a valid media type; use the registered one.
        mimetype='application/zip'
    )
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>', methods=['GET', 'POST'])  # noqa
@login_required
def corpus_file(corpus_id, corpus_file_id):
    """Show and process the edit form for one corpus file.

    Responds with 404 when no file with this id belongs to the corpus and
    with 403 when the requester is neither the corpus owner nor an
    administrator. On a valid submission the metadata is copied from the
    form onto the file, the corpus status is set to UNPREPARED, the session
    is committed and the client is redirected to the corpus view.
    Otherwise the form is filled with the stored values and rendered.
    """
    # Attributes that exist under the same name on the form and the model;
    # the order matches the order in which they are copied.
    metadata_fields = (
        'address',
        'author',
        'booktitle',
        'chapter',
        'editor',
        'institution',
        'journal',
        'pages',
        'publisher',
        'publishing_year',
        'school',
        'title',
    )
    corpus_file = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    ).first_or_404()
    owns_corpus = corpus_file.corpus.user == current_user
    if not (owns_corpus or current_user.is_administrator()):
        abort(403)
    form = EditCorpusFileForm(prefix='edit-corpus-file-form')
    if form.validate_on_submit():
        for field_name in metadata_fields:
            setattr(corpus_file, field_name, getattr(form, field_name).data)
        corpus_file.corpus.status = CorpusStatus.UNPREPARED
        db.session.commit()
        flash(f'Corpus file "{corpus_file.filename}" edited', category='corpus')  # noqa
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    # If no form is submitted or valid, fill out fields with current values
    for field_name in metadata_fields:
        getattr(form, field_name).data = getattr(corpus_file, field_name)
    return render_template(
        'corpora/corpus_file.html.j2',
        corpus=corpus_file.corpus,
        corpus_file=corpus_file,
        form=form,
        title='Edit corpus file'
    )
@bp.route('/<hashid:corpus_id>/files/add', methods=['GET', 'POST']) @bp.route('/<hashid:corpus_id>/files/add', methods=['GET', 'POST'])
@login_required @login_required
def add_corpus_file(corpus_id): def add_corpus_file(corpus_id):
...@@ -271,76 +352,4 @@ def download_corpus_file(corpus_id, corpus_file_id): ...@@ -271,76 +352,4 @@ def download_corpus_file(corpus_id, corpus_file_id):
attachment_filename=corpus_file.filename, attachment_filename=corpus_file.filename,
directory=os.path.dirname(corpus_file.path), directory=os.path.dirname(corpus_file.path),
filename=os.path.basename(corpus_file.path) filename=os.path.basename(corpus_file.path)
) )
\ No newline at end of file
@bp.route('/<hashid:corpus_id>/files/<hashid:corpus_file_id>', methods=['GET', 'POST'])  # noqa
@login_required
def corpus_file(corpus_id, corpus_file_id):
    """Show and process the edit form for one corpus file.

    Responds with 404 when no file with this id belongs to the corpus and
    with 403 when the requester is neither the corpus owner nor an
    administrator. On a valid submission the metadata is copied from the
    form onto the file, the corpus status is set to UNPREPARED, the session
    is committed and the client is redirected to the corpus view.
    Otherwise the form is filled with the stored values and rendered.
    """
    # Attributes that exist under the same name on the form and the model;
    # the order matches the order in which they are copied.
    metadata_fields = (
        'address',
        'author',
        'booktitle',
        'chapter',
        'editor',
        'institution',
        'journal',
        'pages',
        'publisher',
        'publishing_year',
        'school',
        'title',
    )
    corpus_file = CorpusFile.query.filter(
        CorpusFile.corpus_id == corpus_id,
        CorpusFile.id == corpus_file_id
    ).first_or_404()
    owns_corpus = corpus_file.corpus.user == current_user
    if not (owns_corpus or current_user.is_administrator()):
        abort(403)
    form = EditCorpusFileForm(prefix='edit-corpus-file-form')
    if form.validate_on_submit():
        for field_name in metadata_fields:
            setattr(corpus_file, field_name, getattr(form, field_name).data)
        corpus_file.corpus.status = CorpusStatus.UNPREPARED
        db.session.commit()
        flash(f'Corpus file "{corpus_file.filename}" edited', category='corpus')  # noqa
        return redirect(url_for('.corpus', corpus_id=corpus_id))
    # If no form is submitted or valid, fill out fields with current values
    for field_name in metadata_fields:
        getattr(form, field_name).data = getattr(corpus_file, field_name)
    return render_template(
        'corpora/corpus_file.html.j2',
        corpus=corpus_file.corpus,
        corpus_file=corpus_file,
        form=form,
        title='Edit corpus file'
    )
@bp.route('/<hashid:corpus_id>/build')
@login_required
def build_corpus(corpus_id):
    """Hand the corpus to ``tasks.build_corpus`` and redirect to its page.

    Responds with 404 when the corpus does not exist and with 403 when the
    requester is neither its owner nor an administrator. A corpus without
    files is not handed over; an error message is flashed instead. Either
    way the client is redirected to the corpus view.
    """
    corpus = Corpus.query.get_or_404(corpus_id)
    is_owner = corpus.user == current_user
    if not is_owner and not current_user.is_administrator():
        abort(403)
    if not corpus.files.all():
        # Nothing to build from: report and fall through to the redirect.
        flash(
            f'Can\'t build corpus "{corpus.title}": No corpus file(s)',
            category='error'
        )
    else:
        tasks.build_corpus(corpus_id)
        flash(
            f'Corpus "{corpus.title}" marked for building',
            category='corpus'
        )
    return redirect(url_for('.corpus', corpus_id=corpus_id))
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
</div> </div>
<div class="row"> <div class="row">
<div class="col s12"> <div class="col s12">
{{ wtf.render_field(form.file, accept='.zip', placeholder='Choose your exported .zip file') }} {{ wtf.render_field(form.archive, accept='.zip', placeholder='Choose an exported ZIP archive') }}
</div> </div>
</div> </div>
</div> </div>
......
...@@ -40,7 +40,7 @@ ...@@ -40,7 +40,7 @@
<ul class="pagination"></ul> <ul class="pagination"></ul>
</div> </div>
<div class="card-action right-align"> <div class="card-action right-align">
<a class="btn disabled waves-effect waves-light" href="{{ url_for('corpora.import_corpus') }}"><i class="material-icons right">import_export</i>Import Corpus</a> <a class="btn waves-effect waves-light" href="{{ url_for('corpora.import_corpus') }}"><i class="material-icons right">import_export</i>Import Corpus</a>
<a class="btn waves-effect waves-light" href="{{ url_for('corpora.add_corpus') }}">New corpus<i class="material-icons right">add</i></a> <a class="btn waves-effect waves-light" href="{{ url_for('corpora.add_corpus') }}">New corpus<i class="material-icons right">add</i></a>
</div> </div>
</div> </div>
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment