            pbar.close()
            db.session.commit()

    def delete(self):
        try:
            os.remove(self.path)
        except OSError as e:
            current_app.logger.error(e)
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'compatible_service_versions': self.compatible_service_versions,
            'description': self.description,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'publishing_url': self.publishing_url,
            'publishing_year': self.publishing_year,
            'is_public': self.is_public,
            **self.file_mixin_to_json_serializeable()
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        return json_serializeable
class SpaCyNLPPipelineModel(FileMixin, HashidMixin, db.Model):
__tablename__ = 'spacy_nlp_pipeline_models'
# Primary key
id = db.Column(db.Integer, primary_key=True)
# Foreign keys
user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
# Fields
title = db.Column(db.String(64))
description = db.Column(db.String(255))
version = db.Column(db.String(16))
compatible_service_versions = db.Column(ContainerColumn(list, 255))
publisher = db.Column(db.String(128))
publisher_url = db.Column(db.String(512))
publishing_url = db.Column(db.String(512))
publishing_year = db.Column(db.Integer)
pipeline_name = db.Column(db.String(64))
is_public = db.Column(db.Boolean, default=False)
# Relationships
user = db.relationship('User', back_populates='spacy_nlp_pipeline_models')
@property
def path(self):
return os.path.join(
self.user.path,
'spacy_nlp_pipeline_models',
str(self.id)
)
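    # The jsonpatch_path properties in this module build the JSON Pointer under
    # which a resource lives in the per-user JSON document; the SQLAlchemy event
    # handlers at the bottom of the module reuse these paths when emitting
    # 'PATCH' messages over Socket.IO.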
@property
def jsonpatch_path(self):
return f'{self.user.jsonpatch_path}/spacy_nlp_pipeline_models/{self.hashid}'
@property
def url(self):
return url_for(
'contributions.spacy_nlp_pipeline_model',
spacy_nlp_pipeline_model_id=self.id
)
@property
def user_hashid(self):
return self.user.hashid
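    # insert_defaults() seeds the database with the models listed in
    # SpaCyNLPPipelineModel.defaults.yml: existing rows (matched by title and
    # version) are updated in place, new rows are created for the 'nopaque'
    # user, and each model file is streamed to disk with a tqdm progress bar.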
@staticmethod
def insert_defaults():
nopaque_user = User.query.filter_by(username='nopaque').first()
defaults_file = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'SpaCyNLPPipelineModel.defaults.yml'
)
with open(defaults_file, 'r') as f:
defaults = yaml.safe_load(f)
for m in defaults:
model = SpaCyNLPPipelineModel.query.filter_by(title=m['title'], version=m['version']).first() # noqa
if model is not None:
model.compatible_service_versions = m['compatible_service_versions']
model.description = m['description']
model.publisher = m['publisher']
model.publisher_url = m['publisher_url']
model.publishing_url = m['publishing_url']
model.publishing_year = m['publishing_year']
model.is_public = True
model.title = m['title']
model.version = m['version']
model.pipeline_name = m['pipeline_name']
continue
model = SpaCyNLPPipelineModel(
compatible_service_versions=m['compatible_service_versions'],
description=m['description'],
publisher=m['publisher'],
publisher_url=m['publisher_url'],
publishing_url=m['publishing_url'],
publishing_year=m['publishing_year'],
is_public=True,
title=m['title'],
user=nopaque_user,
version=m['version'],
pipeline_name=m['pipeline_name']
)
db.session.add(model)
db.session.flush(objects=[model])
db.session.refresh(model)
r = requests.get(m['url'], stream=True)
pbar = tqdm(
desc=f'{model.title} ({model.filename})',
unit="B",
unit_scale=True,
unit_divisor=1024,
total=int(r.headers['Content-Length'])
)
pbar.clear()
with open(model.path, 'wb') as f:

for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
pbar.update(len(chunk))
f.write(chunk)
pbar.close()
db.session.commit()
    def delete(self):
        try:
            os.remove(self.path)
        except OSError as e:
            current_app.logger.error(e)
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'compatible_service_versions': self.compatible_service_versions,
            'description': self.description,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'publishing_url': self.publishing_url,
            'publishing_year': self.publishing_year,
            'is_public': self.is_public,
            **self.file_mixin_to_json_serializeable()
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            pass
        return json_serializeable

class JobInput(FileMixin, HashidMixin, db.Model):
__tablename__ = 'job_inputs'
# Primary key
id = db.Column(db.Integer, primary_key=True)
# Foreign keys
job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))
# Relationships
job = db.relationship(
'Job',
back_populates='inputs'
)
    def __repr__(self):
        return f'<JobInput {self.filename}>'

    @property
    def download_url(self):
        return url_for(
            'jobs.download_job_input',
            job_id=self.job.id,
            job_input_id=self.id
        )

    @property
    def jsonpatch_path(self):
        return f'{self.job.jsonpatch_path}/inputs/{self.hashid}'

    @property
    def path(self):
        return os.path.join(self.job.path, 'inputs', str(self.id))

    @property
    def url(self):
        return url_for(
            'jobs.job',
            job_id=self.job_id,
            _anchor=f'job-{self.job.hashid}-input-{self.hashid}'
        )

    @property
    def user_hashid(self):
        return self.job.user.hashid

    @property
    def user_id(self):
        return self.job.user_id

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            **self.file_mixin_to_json_serializeable()
        }
        if backrefs:
            json_serializeable['job'] = \
                self.job.to_json_serializeable(backrefs=True)
        return json_serializeable
class JobResult(FileMixin, HashidMixin, db.Model):
__tablename__ = 'job_results'
# Primary key
id = db.Column(db.Integer, primary_key=True)
# Foreign keys
job_id = db.Column(db.Integer, db.ForeignKey('jobs.id'))

# Fields
description = db.Column(db.String(255))
# Relationships
job = db.relationship(
'Job',
back_populates='results'
)
def __repr__(self):
return f'<JobResult {self.filename}>'
@property
def download_url(self):
return url_for(
'jobs.download_job_result',
job_id=self.job_id,
job_result_id=self.id
)
@property
def jsonpatch_path(self):
return f'{self.job.jsonpatch_path}/results/{self.hashid}'

    @property
    def path(self):
        return os.path.join(self.job.path, 'results', str(self.id))

    @property
    def url(self):
        return url_for(
            'jobs.job',
            job_id=self.job_id,
            _anchor=f'job-{self.job.hashid}-result-{self.hashid}'
        )

    @property
    def user_hashid(self):
        return self.job.user.hashid

    @property
    def user_id(self):
        return self.job.user_id

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'description': self.description,
            **self.file_mixin_to_json_serializeable(
                backrefs=backrefs,
                relationships=relationships
            )
        }
        if backrefs:
            json_serializeable['job'] = \
                self.job.to_json_serializeable(backrefs=True)
        return json_serializeable
class Job(HashidMixin, db.Model):
    __tablename__ = 'jobs'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    end_date = db.Column(db.DateTime())
    service_args = db.Column(ContainerColumn(dict, 255))
    status = db.Column(
        default=JobStatus.INITIALIZING
    )
    # Relationships
    inputs = db.relationship(
        'JobInput',
        back_populates='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    results = db.relationship(
        'JobResult',
        back_populates='job',
        cascade='all, delete-orphan',
        lazy='dynamic'
    )
    user = db.relationship(
        'User',
        back_populates='jobs'
    )
    def __repr__(self):
        return f'<Job {self.title}>'

    @property
    def jsonpatch_path(self):
        return f'{self.user.jsonpatch_path}/jobs/{self.hashid}'

    @property
    def path(self):
        return os.path.join(self.user.path, 'jobs', str(self.id))

    @property
    def url(self):
        return url_for('jobs.job', job_id=self.id)

    @property
    def user_hashid(self):
        return self.user.hashid
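    # create() adds the job row, flushes so the database assigns an id, and
    # then lays out the job's directory structure (inputs/, pipeline_data/,
    # results/); on any OSError the transaction is rolled back and the error
    # is re-raised.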
    @staticmethod
    def create(**kwargs):
        job = Job(**kwargs)
        db.session.add(job)
        db.session.flush(objects=[job])
        db.session.refresh(job)
        try:
            os.mkdir(job.path)
            os.mkdir(os.path.join(job.path, 'inputs'))
            os.mkdir(os.path.join(job.path, 'pipeline_data'))
            os.mkdir(os.path.join(job.path, 'results'))
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            raise e
        return job
    def delete(self):
        ''' Delete the job and its inputs and results from the database. '''
        if self.status not in [JobStatus.COMPLETED, JobStatus.FAILED]:
            self.status = JobStatus.CANCELING
            db.session.commit()
            while self.status != JobStatus.CANCELED:
                # In case the daemon changed the status in the meantime
                if self.status != JobStatus.CANCELING:
                    self.status = JobStatus.CANCELING
                    db.session.commit()
                sleep(1)
                db.session.refresh(self)
        try:
            shutil.rmtree(self.path)
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            raise e
        db.session.delete(self)

    def restart(self):
        ''' Restart a job - only if the status is failed '''
        if self.status != JobStatus.FAILED:
            raise Exception('Job status is not "failed"')
        shutil.rmtree(os.path.join(self.path, 'results'), ignore_errors=True)
        shutil.rmtree(os.path.join(self.path, 'pyflow.data'), ignore_errors=True)
        for result in self.results:
            db.session.delete(result)
        self.status = JobStatus.SUBMITTED
    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'creation_date': f'{self.creation_date.isoformat()}Z',
            'description': self.description,
            'end_date': (
                None if self.end_date is None
                else f'{self.end_date.isoformat()}Z'
            ),
            'service': self.service,
            'service_args': self.service_args,
            'service_version': self.service_version,
            'status': self.status.name
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            json_serializeable['inputs'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.inputs
            }
            json_serializeable['results'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.results
            }
        return json_serializeable

class CorpusFile(FileMixin, HashidMixin, db.Model):
__tablename__ = 'corpus_files'
# Primary key
id = db.Column(db.Integer, primary_key=True)
# Foreign keys
corpus_id = db.Column(db.Integer, db.ForeignKey('corpora.id'))
# Fields
author = db.Column(db.String(255))
description = db.Column(db.String(255))
publishing_year = db.Column(db.Integer)
title = db.Column(db.String(255))
address = db.Column(db.String(255))
booktitle = db.Column(db.String(255))
chapter = db.Column(db.String(255))
editor = db.Column(db.String(255))
institution = db.Column(db.String(255))
journal = db.Column(db.String(255))
pages = db.Column(db.String(255))
publisher = db.Column(db.String(255))
school = db.Column(db.String(255))
# Relationships
corpus = db.relationship(
'Corpus',
back_populates='files'
)
@property
def download_url(self):
return url_for(
'corpora.download_corpus_file',
corpus_id=self.corpus_id,
corpus_file_id=self.id
)
    @property
    def jsonpatch_path(self):
        return f'{self.corpus.jsonpatch_path}/files/{self.hashid}'

    @property
    def path(self):
        return os.path.join(self.corpus.path, 'files', str(self.id))

    @property
    def url(self):
        return url_for(
            'corpora.corpus_file',
            corpus_id=self.corpus_id,
            corpus_file_id=self.id
        )

    @property
    def user_hashid(self):
        return self.corpus.user.hashid

    @property
    def user_id(self):
        return self.corpus.user_id

    def delete(self):
        try:
            os.remove(self.path)
        except OSError as e:
            current_app.logger.error(e)
        self.corpus.status = CorpusStatus.UNPREPARED
        db.session.delete(self)

    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'address': self.address,
            'author': self.author,
            'booktitle': self.booktitle,
            'chapter': self.chapter,
            'editor': self.editor,
            'institution': self.institution,
            'journal': self.journal,
            'pages': self.pages,
            'publisher': self.publisher,
            'publishing_year': self.publishing_year,
            'school': self.school,
            'title': self.title,
            **self.file_mixin_to_json_serializeable(
                backrefs=backrefs,
                relationships=relationships
            )
        }
        if backrefs:
            json_serializeable['corpus'] = \
                self.corpus.to_json_serializeable(backrefs=True)
        return json_serializeable
class Corpus(HashidMixin, db.Model):
    __tablename__ = 'corpora'
    # Primary key
    id = db.Column(db.Integer, primary_key=True)
    # Foreign keys
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'))
    # Fields
    creation_date = db.Column(db.DateTime(), default=datetime.utcnow)
    description = db.Column(db.String(255))
    status = db.Column(
        default=CorpusStatus.UNPREPARED
    )
    num_analysis_sessions = db.Column(db.Integer, default=0)
    num_tokens = db.Column(db.Integer, default=0)
    is_public = db.Column(db.Boolean, default=False)
    # Relationships
    files = db.relationship(
        'CorpusFile',
        back_populates='corpus',
        lazy='dynamic',
        cascade='all, delete-orphan'
    )
    corpus_follower_associations = db.relationship(
        'CorpusFollowerAssociation',
        back_populates='corpus',
        cascade='all, delete-orphan'
    )
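    # 'followers' exposes the User objects behind corpus_follower_associations
    # as a simple collection; appending a User transparently creates the
    # corresponding CorpusFollowerAssociation row.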
followers = association_proxy(
'corpus_follower_associations',
'follower',
creator=lambda u: CorpusFollowerAssociation(follower=u)
)
user = db.relationship('User', back_populates='corpora')

# "static" attributes
def __repr__(self):
return f'<Corpus {self.title}>'
@property
def jsonpatch_path(self):
return f'{self.user.jsonpatch_path}/corpora/{self.hashid}'

@property
def path(self):
return os.path.join(self.user.path, 'corpora', str(self.id))

@property
def url(self):
return url_for('corpora.corpus', corpus_id=self.id)
@property
def user_hashid(self):
return self.user.hashid
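    # create() mirrors Job.create(): add the corpus, flush to obtain an id,
    # then create the directory layout expected by the CWB tooling
    # (files/, cwb/data/, cwb/registry/).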
    @staticmethod
    def create(**kwargs):
        corpus = Corpus(**kwargs)
        db.session.add(corpus)
        db.session.flush(objects=[corpus])
        db.session.refresh(corpus)
        try:
            os.mkdir(corpus.path)
            os.mkdir(os.path.join(corpus.path, 'files'))
            os.mkdir(os.path.join(corpus.path, 'cwb'))
            os.mkdir(os.path.join(corpus.path, 'cwb', 'data'))
            os.mkdir(os.path.join(corpus.path, 'cwb', 'registry'))
        except OSError as e:
            current_app.logger.error(e)
            db.session.rollback()
            raise e
        return corpus
    def build(self):
        corpus_element = ET.fromstring('<corpus>\n</corpus>')
        for corpus_file in self.files:
            normalized_vrt_path = os.path.join(self.path, 'cwb', f'{corpus_file.id}.norm.vrt')
            try:
                normalize_vrt_file(corpus_file.path, normalized_vrt_path)
            except:
                self.status = CorpusStatus.FAILED
                return
            element_tree = ET.parse(normalized_vrt_path)
            text_element = element_tree.getroot()
            text_element.set('author', corpus_file.author)
            text_element.set('title', corpus_file.title)
            text_element.set(
                'publishing_year',
                f'{corpus_file.publishing_year}'
            )
            text_element.set('address', corpus_file.address or 'NULL')
            text_element.set('booktitle', corpus_file.booktitle or 'NULL')
            text_element.set('chapter', corpus_file.chapter or 'NULL')
            text_element.set('editor', corpus_file.editor or 'NULL')
            text_element.set('institution', corpus_file.institution or 'NULL')
            text_element.set('journal', corpus_file.journal or 'NULL')
            text_element.set('pages', corpus_file.pages or 'NULL')
            text_element.set('publisher', corpus_file.publisher or 'NULL')
            text_element.set('school', corpus_file.school or 'NULL')
            text_element.tail = '\n'
            # corpus_element.insert(1, text_element)
            corpus_element.append(text_element)
        ET.ElementTree(corpus_element).write(
            os.path.join(self.path, 'cwb', 'corpus.vrt'),
            encoding='utf-8'
        )
        self.status = CorpusStatus.SUBMITTED

    def delete(self):
        shutil.rmtree(self.path, ignore_errors=True)
        db.session.delete(self)
    def to_json_serializeable(self, backrefs=False, relationships=False):
        json_serializeable = {
            'id': self.hashid,
            'creation_date': f'{self.creation_date.isoformat()}Z',
            'description': self.description,
            'max_num_tokens': self.max_num_tokens,
            'num_analysis_sessions': self.num_analysis_sessions,
            'num_tokens': self.num_tokens,
            'status': self.status.name,
            'title': self.title,
            'is_public': self.is_public
        }
        if backrefs:
            json_serializeable['user'] = \
                self.user.to_json_serializeable(backrefs=True)
        if relationships:
            json_serializeable['corpus_follower_associations'] = {
                x.hashid: x.to_json_serializeable()
                for x in self.corpus_follower_associations
            }
            json_serializeable['files'] = {
                x.hashid: x.to_json_serializeable(relationships=True)
                for x in self.files
            }
        return json_serializeable
# endregion models
##############################################################################
# event_handlers #
##############################################################################
# region event_handlers
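# These listeners keep connected clients in sync: every insert, update and
# delete on the models above is translated into a JSON Patch (RFC 6902)
# operation and emitted as a 'PATCH' event to the owning user's Socket.IO
# room. For illustration, deleting a job input might produce a patch like
# [{'op': 'remove', 'path': '/users/<user_hashid>/jobs/<job_hashid>/inputs/<input_hashid>'}]
# (hashids shown as placeholders).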
@db.event.listens_for(Corpus, 'after_delete')
@db.event.listens_for(CorpusFile, 'after_delete')
@db.event.listens_for(Job, 'after_delete')
@db.event.listens_for(JobInput, 'after_delete')
@db.event.listens_for(JobResult, 'after_delete')
@db.event.listens_for(SpaCyNLPPipelineModel, 'after_delete')
@db.event.listens_for(TesseractOCRPipelineModel, 'after_delete')
def ressource_after_delete(mapper, connection, ressource):
jsonpatch = [{'op': 'remove', 'path': ressource.jsonpatch_path}]
room = f'/users/{ressource.user_hashid}'
socketio.emit('PATCH', jsonpatch, room=room)
@db.event.listens_for(CorpusFollowerAssociation, 'after_delete')
def corpus_follower_association_after_delete_handler(mapper, connection, ressource):
corpus_owner_hashid = ressource.corpus.user.hashid
corpus_hashid = hashids.encode(ressource.corpus_id)
follower_hashid = hashids.encode(ressource.follower_id)
# Send a PATCH to the corpus owner
jsonpatch_path = f'/users/{corpus_owner_hashid}/corpora/{corpus_hashid}/corpus_follower_associations/{ressource.hashid}'
jsonpatch = [{'op': 'remove', 'path': jsonpatch_path}]
room = f'/users/{corpus_owner_hashid}'
socketio.emit('PATCH', jsonpatch, room=room)
# Send a PATCH to the follower
jsonpatch_path = f'/users/{follower_hashid}/corpus_follower_associations/{ressource.hashid}'
jsonpatch = [{'op': 'remove', 'path': jsonpatch_path}]
room = f'/users/{follower_hashid}'
socketio.emit('PATCH', jsonpatch, room=room)
@db.event.listens_for(Corpus, 'after_insert')
@db.event.listens_for(CorpusFile, 'after_insert')
@db.event.listens_for(Job, 'after_insert')
@db.event.listens_for(JobInput, 'after_insert')
@db.event.listens_for(JobResult, 'after_insert')
@db.event.listens_for(SpaCyNLPPipelineModel, 'after_insert')
@db.event.listens_for(TesseractOCRPipelineModel, 'after_insert')
def ressource_after_insert_handler(mapper, connection, ressource):
value = ressource.to_json_serializeable()
for attr in mapper.relationships:
value[attr.key] = {}
jsonpatch = [
{'op': 'add', 'path': ressource.jsonpatch_path, 'value': value}
]
room = f'/users/{ressource.user_hashid}'
socketio.emit('PATCH', jsonpatch, room=room)
@db.event.listens_for(CorpusFollowerAssociation, 'after_insert')
def corpus_follower_association_after_insert_handler(mapper, connection, ressource):
corpus_owner_hashid = ressource.corpus.user.hashid
corpus_hashid = hashids.encode(ressource.corpus_id)
follower_hashid = hashids.encode(ressource.follower_id)
value = ressource.to_json_serializeable()
# Send a PATCH to the corpus owner
jsonpatch_path = f'/users/{corpus_owner_hashid}/corpora/{corpus_hashid}/corpus_follower_associations/{ressource.hashid}'
jsonpatch = [{'op': 'add', 'path': jsonpatch_path, 'value': value}]
room = f'/users/{corpus_owner_hashid}'
socketio.emit('PATCH', jsonpatch, room=room)
# Send a PATCH to the follower
jsonpatch_path = f'/users/{follower_hashid}/corpus_follower_associations/{ressource.hashid}'
jsonpatch = [{'op': 'add', 'path': jsonpatch_path, 'value': value}]
room = f'/users/{follower_hashid}'
socketio.emit('PATCH', jsonpatch, room=room)
@db.event.listens_for(Corpus, 'after_update')
@db.event.listens_for(CorpusFile, 'after_update')
@db.event.listens_for(Job, 'after_update')
@db.event.listens_for(JobInput, 'after_update')
@db.event.listens_for(JobResult, 'after_update')
@db.event.listens_for(SpaCyNLPPipelineModel, 'after_update')
@db.event.listens_for(TesseractOCRPipelineModel, 'after_update')
def ressource_after_update_handler(mapper, connection, ressource):
jsonpatch = []
for attr in db.inspect(ressource).attrs:
if attr.key in mapper.relationships:
continue
if not attr.load_history().has_changes():
continue
if isinstance(attr.value, datetime):
value = f'{attr.value.isoformat()}Z'
elif isinstance(attr.value, Enum):
value = attr.value.name
else:
value = attr.value
jsonpatch.append(
{
'op': 'replace',
'path': f'{ressource.jsonpatch_path}/{attr.key}',
'value': value
}
)
if jsonpatch:
room = f'/users/{ressource.user_hashid}'
socketio.emit('PATCH', jsonpatch, room=room)
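# A second 'after_update' listener on Job sends e-mail notifications: it fires
# only when the status attribute actually changed and respects the user's
# job-status notification level (NONE suppresses all mails, END restricts them
# to COMPLETED/FAILED).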
@db.event.listens_for(Job, 'after_update')
def job_after_update_handler(mapper, connection, job):
for attr in db.inspect(job).attrs:
if attr.key != 'status':
continue
if not attr.load_history().has_changes():
return
if job.user.setting_job_status_mail_notification_level == UserSettingJobStatusMailNotificationLevel.NONE:
return
if job.user.setting_job_status_mail_notification_level == UserSettingJobStatusMailNotificationLevel.END:
if job.status not in [JobStatus.COMPLETED, JobStatus.FAILED]:
return
msg = create_message(
job.user.email,
f'Status update for your Job "{job.title}"',
'tasks/email/notification',
job=job
)
mail.send(msg)
# endregion event_handlers
##############################################################################
# misc #
##############################################################################
# region misc
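# Flask-Login calls this loader with the user id stored in the session cookie
# to rehydrate the current_user object on each request.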
@login.user_loader
def load_user(user_id):
return User.query.get(int(user_id))
# endregion misc