Newer
Older
total=int(r.headers['Content-Length'])
)
pbar.clear()
with open(model.path, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
pbar.update(len(chunk))
f.write(chunk)
pbar.close()
def delete(self):
try:
os.remove(self.path)
except OSError as e:
current_app.logger.error(e)
db.session.delete(self)
def to_json_serializeable(self, backrefs=False, relationships=False):
json_serializeable = {
'id': self.hashid,
'compatible_service_versions': self.compatible_service_versions,
'description': self.description,
'publisher': self.publisher,
'publisher_url': self.publisher_url,
'publishing_url': self.publishing_url,
'publishing_year': self.publishing_year,
'is_public': self.is_public,
**self.file_mixin_to_json_serializeable()
json_serializeable['user'] = \
self.user.to_json_serializeable(backrefs=True)
class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model):
__tablename__ = 'spacy_nlp_pipeline_models'
# Primary key
id = db.Column(db.Integer, primary_key=True)
# Foreign keys
user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
# Fields
title = db.Column(db.String(64))
description = db.Column(db.String(255))
version = db.Column(db.String(16))
compatible_service_versions = db.Column(ContainerColumn(list, 255))
publisher = db.Column(db.String(128))
publisher_url = db.Column(db.String(512))
publishing_url = db.Column(db.String(512))
publishing_year = db.Column(db.Integer)
pipeline_name = db.Column(db.String(64))
is_public = db.Column(db.Boolean, default=False)
# Relationships
user = db.relationship('User', back_populates='spacy_nlp_pipeline_models')
@property
def path(self):
return os.path.join(
self.user.path,
'spacy_nlp_pipeline_models',
str(self.id)
)
@property
def jsonpatch_path(self):
return f'{self.user.jsonpatch_path}/spacy_nlp_pipeline_models/{self.hashid}'
@property
def url(self):
return url_for(
'contributions.spacy_nlp_pipeline_model',
spacy_nlp_pipeline_model_id=self.id
)
@property
def user_hashid(self):
return self.user.hashid
def insert_defaults(force_download=False):
nopaque_user = User.query.filter_by(username='nopaque').first()
defaults_file = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'SpaCyNLPPipelineModel.defaults.yml'
)
with open(defaults_file, 'r') as f:
defaults = yaml.safe_load(f)
for m in defaults:
model = SpaCyNLPPipelineModel.query.filter_by(title=m['title'], version=m['version']).first() # noqa
if model is not None:
model.compatible_service_versions = m['compatible_service_versions']
model.description = m['description']
model.filename = m['url'].split('/')[-1]
model.publisher = m['publisher']
model.publisher_url = m['publisher_url']
model.publishing_url = m['publishing_url']
model.publishing_year = m['publishing_year']
model.is_public = True
model.title = m['title']
model.version = m['version']
model.pipeline_name = m['pipeline_name']
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
else:
model = SpaCyNLPPipelineModel(
compatible_service_versions=m['compatible_service_versions'],
description=m['description'],
filename=m['url'].split('/')[-1],
publisher=m['publisher'],
publisher_url=m['publisher_url'],
publishing_url=m['publishing_url'],
publishing_year=m['publishing_year'],
is_public=True,
title=m['title'],
user=nopaque_user,
version=m['version'],
pipeline_name=m['pipeline_name']
)
db.session.add(model)
db.session.flush(objects=[model])
db.session.refresh(model)
if not os.path.exists(model.path) or force_download:
r = requests.get(m['url'], stream=True)
pbar = tqdm(
desc=f'{model.title} ({model.filename})',
unit="B",
unit_scale=True,
unit_divisor=1024,
total=int(r.headers['Content-Length'])
)
pbar.clear()
with open(model.path, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
pbar.update(len(chunk))
f.write(chunk)
pbar.close()

Patrick Jentsch
committed
db.session.commit()
def delete(self):
try:
os.remove(self.path)
except OSError as e:
current_app.logger.error(e)
db.session.delete(self)

Patrick Jentsch
committed
def to_json_serializeable(self, backrefs=False, relationships=False):
json_serializeable = {
'id': self.hashid,
'compatible_service_versions': self.compatible_service_versions,
'description': self.description,
'publisher': self.publisher,
'publisher_url': self.publisher_url,
'publishing_url': self.publishing_url,
'publishing_year': self.publishing_year,
'is_public': self.is_public,
**self.file_mixin_to_json_serializeable()
json_serializeable['user'] = \
self.user.to_json_serializeable(backrefs=True)
if relationships:
pass

Patrick Jentsch
committed
class JobInput(FileMixin, HashidMixin, db.Model):
__tablename__ = 'job_inputs'
# Primary key
id = db.Column(db.Integer, primary_key=True)
# Foreign keys
job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
# Relationships
job = db.relationship(
'Job',
back_populates='inputs'
)
def __repr__(self):
return f'<JobInput {self.filename}>'
return url_for(
'jobs.download_job_input',
job_id=self.job.id,
job_input_id=self.id
)
@property
def jsonpatch_path(self):
return f'{self.job.jsonpatch_path}/inputs/{self.hashid}'

Patrick Jentsch
committed
@property
def path(self):

Patrick Jentsch
committed
return os.path.join(self.job.path, 'inputs', str(self.id))

Patrick Jentsch
committed
return url_for(
'jobs.job',
job_id=self.job_id,
_anchor=f'job-{self.job.hashid}-input-{self.hashid}'
)
@property
def user_hashid(self):
return self.job.user.hashid
@property
def user_id(self):
def to_json_serializeable(self, backrefs=False, relationships=False):
json_serializeable = {
**self.file_mixin_to_json_serializeable()
json_serializeable['job'] = \
self.job.to_json_serializeable(backrefs=True)
class JobResult(FileMixin, HashidMixin, db.Model):
__tablename__ = 'job_results'
# Primary key
id = db.Column(db.Integer, primary_key=True)
# Foreign keys
job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))

Patrick Jentsch
committed
# Fields
description = db.Column(db.String(255))
# Relationships
job = db.relationship(
'Job',
back_populates='results'
)
def __repr__(self):
return f'<JobResult {self.filename}>'
@property
def download_url(self):
return url_for(
'jobs.download_job_result',
job_id=self.job_id,
job_result_id=self.id
)
@property
def jsonpatch_path(self):
return f'{self.job.jsonpatch_path}/results/{self.hashid}'

Patrick Jentsch
committed
@property
def path(self):

Patrick Jentsch
committed
return os.path.join(self.job.path, 'results', str(self.id))

Patrick Jentsch
committed
return url_for(
'jobs.job',
job_id=self.job_id,
_anchor=f'job-{self.job.hashid}-result-{self.hashid}'
)
@property
def user_hashid(self):
return self.job.user.hashid
@property
def user_id(self):
def to_json_serializeable(self, backrefs=False, relationships=False):
json_serializeable = {
'id': self.hashid,
'description': self.description,
**self.file_mixin_to_json_serializeable(
backrefs=backrefs,
relationships=relationships
)
}
if backrefs:
json_serializeable['job'] = \
self.job.to_json_serializeable(backrefs=True)
# Foreign keys
user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
creation_date = \
db.Column(db.DateTime(), default=datetime.utcnow)
end_date = db.Column(db.DateTime())
service_args = db.Column(ContainerColumn(dict, 255))
status = db.Column(
default=JobStatus.INITIALIZING
)
inputs = db.relationship(
'JobInput',
cascade='all, delete-orphan',
lazy='dynamic'
)
results = db.relationship(
'JobResult',
cascade='all, delete-orphan',
lazy='dynamic'
)
user = db.relationship(
'User',
back_populates='jobs'
)
def __repr__(self):
return f'<Job {self.title}>'

Patrick Jentsch
committed
@property
def jsonpatch_path(self):
return f'{self.user.jsonpatch_path}/jobs/{self.hashid}'

Patrick Jentsch
committed
@property
def path(self):
return os.path.join(self.user.path, 'jobs', str(self.id))
def url(self):
return url_for('jobs.job', job_id=self.id)
@property
def user_hashid(self):
return self.user.hashid
@staticmethod
def create(**kwargs):
job = Job(**kwargs)
db.session.add(job)
db.session.flush(objects=[job])
db.session.refresh(job)
try:
os.mkdir(job.path)
os.mkdir(os.path.join(job.path, 'inputs'))
os.mkdir(os.path.join(job.path, 'pipeline_data'))
os.mkdir(os.path.join(job.path, 'results'))
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
raise e
return job
''' Delete the job and its inputs and results from the database. '''

Patrick Jentsch
committed
if self.status not in [JobStatus.COMPLETED, JobStatus.FAILED]: # noqa
self.status = JobStatus.CANCELING
db.session.commit()

Patrick Jentsch
committed
while self.status != JobStatus.CANCELED:
# In case the daemon handled a job in any way

Patrick Jentsch
committed
if self.status != JobStatus.CANCELING:
self.status = JobStatus.CANCELING
db.session.commit()
sleep(1)
db.session.refresh(self)
try:
shutil.rmtree(self.path)
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
raise e
''' Restart a job - only if the status is failed '''
if self.status != JobStatus.FAILED:
raise Exception('Job status is not "failed"')

Patrick Jentsch
committed
shutil.rmtree(os.path.join(self.path, 'results'), ignore_errors=True)
shutil.rmtree(os.path.join(self.path, 'pyflow.data'), ignore_errors=True)
for result in self.results:
db.session.delete(result)

Patrick Jentsch
committed
self.status = JobStatus.SUBMITTED
def to_json_serializeable(self, backrefs=False, relationships=False):
json_serializeable = {
'creation_date': f'{self.creation_date.isoformat()}Z',
'description': self.description,
'end_date': (
None if self.end_date is None
else f'{self.end_date.isoformat()}Z'
),
'service': self.service,
'service_args': self.service_args,
'service_version': self.service_version,

Patrick Jentsch
committed
'status': self.status.name,
json_serializeable['user'] = \
self.user.to_json_serializeable(backrefs=True)
json_serializeable['inputs'] = {
x.hashid: x.to_json_serializeable(relationships=True)
json_serializeable['results'] = {
x.hashid: x.to_json_serializeable(relationships=True)

Patrick Jentsch
committed
class CorpusFile(FileMixin, HashidMixin, db.Model):
__tablename__ = 'corpus_files'
# Primary key
id = db.Column(db.Integer, primary_key=True)
# Foreign keys
corpus_id = db.Column(db.Integer, db.ForeignKey('corpora.id'))
description = db.Column(db.String(255))
publishing_year = db.Column(db.Integer)
title = db.Column(db.String(255))
address = db.Column(db.String(255))
booktitle = db.Column(db.String(255))
chapter = db.Column(db.String(255))
editor = db.Column(db.String(255))
institution = db.Column(db.String(255))
journal = db.Column(db.String(255))
pages = db.Column(db.String(255))
publisher = db.Column(db.String(255))
school = db.Column(db.String(255))
# Relationships
corpus = db.relationship(
'Corpus',
back_populates='files'
)
@property
def download_url(self):
return url_for(
'corpora.download_corpus_file',
corpus_id=self.corpus_id,
corpus_file_id=self.id
)
@property
def jsonpatch_path(self):

Patrick Jentsch
committed
@property
def path(self):

Patrick Jentsch
committed
return os.path.join(self.corpus.path, 'files', str(self.id))

Patrick Jentsch
committed
return url_for(
'corpora.corpus_file',
corpus_id=self.corpus_id,
corpus_file_id=self.id
)
@property
def user_hashid(self):
return self.corpus.user.hashid
@property
def user_id(self):
return self.corpus.user_id

Patrick Jentsch
committed
os.remove(self.path)
except OSError as e:
current_app.logger.error(e)

Patrick Jentsch
committed
self.corpus.status = CorpusStatus.UNPREPARED
def to_json_serializeable(self, backrefs=False, relationships=False):
json_serializeable = {
'id': self.hashid,
'address': self.address,
'author': self.author,
'booktitle': self.booktitle,
'chapter': self.chapter,
'editor': self.editor,
'institution': self.institution,
'journal': self.journal,
'pages': self.pages,
'publisher': self.publisher,
'publishing_year': self.publishing_year,
'school': self.school,
'title': self.title,
**self.file_mixin_to_json_serializeable(
backrefs=backrefs,
relationships=relationships
)
json_serializeable['corpus'] = \
self.corpus.to_json_serializeable(backrefs=True)
__tablename__ = 'corpora'
# Primary key
id = db.Column(db.Integer, primary_key=True)
# Foreign keys
user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
description = db.Column(db.String(255))
status = db.Column(
default=CorpusStatus.UNPREPARED
)
num_analysis_sessions = db.Column(db.Integer, default=0)
num_tokens = db.Column(db.Integer, default=0)
is_public = db.Column(db.Boolean, default=False)
files = db.relationship(
'CorpusFile',
lazy='dynamic',
cascade='all, delete-orphan'
)
corpus_follower_associations = db.relationship(
cascade='all, delete-orphan'
followers = association_proxy(
'corpus_follower_associations',
'follower',
creator=lambda u: CorpusFollowerAssociation(follower=u)
)
user = db.relationship('User', back_populates='corpora')

Patrick Jentsch
committed
# "static" attributes
def __repr__(self):
return f'<Corpus {self.title}>'
@property
def jsonpatch_path(self):
return f'{self.user.jsonpatch_path}/corpora/{self.hashid}'

Patrick Jentsch
committed
@property
def path(self):
return os.path.join(self.user.path, 'corpora', str(self.id))

Patrick Jentsch
committed
@property
def url(self):
return url_for('corpora.corpus', corpus_id=self.id)
@property
def user_hashid(self):
return self.user.hashid
@staticmethod
def create(**kwargs):
corpus = Corpus(**kwargs)
db.session.add(corpus)
db.session.flush(objects=[corpus])
db.session.refresh(corpus)
try:
os.mkdir(corpus.path)
os.mkdir(os.path.join(corpus.path, 'files'))
os.mkdir(os.path.join(corpus.path, 'cwb'))
os.mkdir(os.path.join(corpus.path, 'cwb', 'data'))
os.mkdir(os.path.join(corpus.path, 'cwb', 'registry'))
except OSError as e:
current_app.logger.error(e)
db.session.rollback()
raise e
return corpus

Patrick Jentsch
committed
build_dir = os.path.join(self.path, 'cwb')
shutil.rmtree(build_dir, ignore_errors=True)
os.mkdir(build_dir)
os.mkdir(os.path.join(build_dir, 'data'))
os.mkdir(os.path.join(build_dir, 'registry'))
corpus_element = ET.fromstring('<corpus>\n</corpus>')

Patrick Jentsch
committed
normalized_vrt_path = os.path.join(build_dir, f'{corpus_file.id}.norm.vrt')
try:
normalize_vrt_file(corpus_file.path, normalized_vrt_path)
except:
self.status = CorpusStatus.FAILED
return
element_tree = ET.parse(normalized_vrt_path)

Patrick Jentsch
committed
text_element = element_tree.getroot()
text_element.set('author', corpus_file.author)
text_element.set('title', corpus_file.title)
text_element.set(
'publishing_year',
f'{corpus_file.publishing_year}'
)
text_element.set('address', corpus_file.address or 'NULL')

Patrick Jentsch
committed
text_element.set('booktitle', corpus_file.booktitle or 'NULL')
text_element.set('chapter', corpus_file.chapter or 'NULL')
text_element.set('editor', corpus_file.editor or 'NULL')
text_element.set('institution', corpus_file.institution or 'NULL')
text_element.set('journal', corpus_file.journal or 'NULL')
text_element.set('pages', f'{corpus_file.pages}' or 'NULL')

Patrick Jentsch
committed
text_element.set('publisher', corpus_file.publisher or 'NULL')
text_element.set('school', corpus_file.school or 'NULL')
text_element.tail = '\n'
# corpus_element.insert(1, text_element)
corpus_element.append(text_element)

Patrick Jentsch
committed
ET.ElementTree(corpus_element).write(

Patrick Jentsch
committed
os.path.join(build_dir, 'corpus.vrt'),

Patrick Jentsch
committed
encoding='utf-8'
)

Patrick Jentsch
committed
self.status = CorpusStatus.SUBMITTED

Patrick Jentsch
committed
shutil.rmtree(self.path, ignore_errors=True)

Stephan Porada
committed
db.session.delete(self)
def to_json_serializeable(self, backrefs=False, relationships=False):
json_serializeable = {
'creation_date': f'{self.creation_date.isoformat()}Z',
'description': self.description,
'max_num_tokens': self.max_num_tokens,
'num_analysis_sessions': self.num_analysis_sessions,
'num_tokens': self.num_tokens,

Patrick Jentsch
committed
'status': self.status.name,
'title': self.title,
'is_public': self.is_public
json_serializeable['user'] = \
self.user.to_json_serializeable(backrefs=True)
json_serializeable['corpus_follower_associations'] = {
for x in self.corpus_follower_associations
json_serializeable['files'] = {
x.hashid: x.to_json_serializeable(relationships=True)
# endregion models
##############################################################################
# event_handlers #
##############################################################################
# region event_handlers
@db.event.listens_for(Corpus, 'after_delete')
@db.event.listens_for(CorpusFile, 'after_delete')
@db.event.listens_for(Job, 'after_delete')
@db.event.listens_for(JobInput, 'after_delete')
@db.event.listens_for(JobResult, 'after_delete')
@db.event.listens_for(SpaCyNLPPipelineModel, 'after_delete')
@db.event.listens_for(TesseractOCRPipelineModel, 'after_delete')
def resource_after_delete(mapper, connection, resource):
jsonpatch = [
{
'op': 'remove',
'path': resource.jsonpatch_path
}
]
room = f'/users/{resource.user_hashid}'
socketio.emit('PATCH', jsonpatch, room=room)
@db.event.listens_for(CorpusFollowerAssociation, 'after_delete')
def cfa_after_delete_handler(mapper, connection, cfa):
jsonpatch_path = f'/users/{cfa.corpus.user.hashid}/corpora/{cfa.corpus.hashid}/corpus_follower_associations/{cfa.hashid}'
jsonpatch = [
{
'op': 'remove',
'path': jsonpatch_path
}
room = f'/users/{cfa.corpus.user.hashid}'
socketio.emit('PATCH', jsonpatch, room=room)
@db.event.listens_for(Corpus, 'after_insert')
@db.event.listens_for(CorpusFile, 'after_insert')
@db.event.listens_for(Job, 'after_insert')
@db.event.listens_for(JobInput, 'after_insert')
@db.event.listens_for(JobResult, 'after_insert')
@db.event.listens_for(SpaCyNLPPipelineModel, 'after_insert')
@db.event.listens_for(TesseractOCRPipelineModel, 'after_insert')
def resource_after_insert_handler(mapper, connection, resource):
jsonpatch_value = resource.to_json_serializeable()
for attr in mapper.relationships:
jsonpatch = [
{
'op': 'add',
'path': resource.jsonpatch_path,
'value': jsonpatch_value
}
room = f'/users/{resource.user_hashid}'
socketio.emit('PATCH', jsonpatch, room=room)
@db.event.listens_for(CorpusFollowerAssociation, 'after_insert')
def cfa_after_insert_handler(mapper, connection, cfa):
jsonpatch_value = cfa.to_json_serializeable()
jsonpatch_path = f'/users/{cfa.corpus.user.hashid}/corpora/{cfa.corpus.hashid}/corpus_follower_associations/{cfa.hashid}'
jsonpatch = [
{
'op': 'add',
'path': jsonpatch_path,
'value': jsonpatch_value
}
room = f'/users/{cfa.corpus.user.hashid}'
socketio.emit('PATCH', jsonpatch, room=room)
@db.event.listens_for(Corpus, 'after_update')
@db.event.listens_for(CorpusFile, 'after_update')
@db.event.listens_for(Job, 'after_update')
@db.event.listens_for(JobInput, 'after_update')
@db.event.listens_for(JobResult, 'after_update')
@db.event.listens_for(SpaCyNLPPipelineModel, 'after_update')
@db.event.listens_for(TesseractOCRPipelineModel, 'after_update')
def resource_after_update_handler(mapper, connection, resource):
jsonpatch = []
for attr in db.inspect(resource).attrs:
if attr.key in mapper.relationships:
continue
if not attr.load_history().has_changes():
continue
jsonpatch_path = f'{resource.jsonpatch_path}/{attr.key}'
if isinstance(attr.value, datetime):
jsonpatch_value = f'{attr.value.isoformat()}Z'
elif isinstance(attr.value, Enum):
jsonpatch.append(
{
'op': 'replace',
'path': jsonpatch_path,
'value': jsonpatch_value
}
)
if jsonpatch:
room = f'/users/{resource.user_hashid}'
socketio.emit('PATCH', jsonpatch, room=room)
@db.event.listens_for(Job, 'after_update')
def job_after_update_handler(mapper, connection, job):
for attr in db.inspect(job).attrs:
if attr.key != 'status':
continue
if not attr.load_history().has_changes():
return
if job.user.setting_job_status_mail_notification_level == UserSettingJobStatusMailNotificationLevel.NONE:
return
if job.user.setting_job_status_mail_notification_level == UserSettingJobStatusMailNotificationLevel.END:
if job.status not in [JobStatus.COMPLETED, JobStatus.FAILED]:
return
msg = create_message(
job.user.email,
f'Status update for your Job "{job.title}"',
'tasks/email/notification',
job=job
)
mail.send(msg)
# endregion event_handlers
##############################################################################
# misc #
##############################################################################
# region misc
@login.user_loader
def load_user(user_id):
return User.query.get(int(user_id))
# endregion misc