Commit 5bd82d2c authored by Patrick Jentsch

Fix issues with some models, bump version

parent 31a780bb
@@ -52,14 +52,16 @@ with open(args.input_file, 'rb') as input_file:
         encoding = chardet.detect(input_file.read())['encoding']
     else:
         encoding = 'utf-8'
+    # After reading the input file with chardet, the file pointer needs to be reset
     input_file.seek(0)
     text_md5 = hashlib.md5()
     for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
         text_md5.update(chunk)
 # Load the text contents from the input file
 with open(args.input_file, encoding=encoding) as input_file:
-    # spaCy NLP is limited to strings with a maximum of 1 million characters at
+    # spaCy is limited to strings with a maximum of 1 million characters at
     # once. So we split it into suitable chunks.
     text_chunks = textwrap.wrap(
         input_file.read(),
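For context: spaCy refuses to process texts longer than nlp.max_length (1,000,000 characters by default), which is why the input is split into smaller chunks before processing. A minimal sketch of offset-aware chunking, with an illustrative helper name and chunk size rather than the parameters the script actually passes to textwrap.wrap:

def iter_chunks(text, max_len=1_000_000):
    # Yield (offset, chunk) pairs; each chunk stays below max_len characters.
    for offset in range(0, len(text), max_len):
        yield offset, text[offset:offset + max_len]

# Annotations produced for a chunk must add the chunk's offset to their
# start/end positions so they refer back to the original, unsplit text.
for offset, chunk in iter_chunks('a rather long example text', max_len=10):
    print(offset, repr(chunk))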
@@ -75,38 +77,48 @@ with open(args.input_file, encoding=encoding) as input_file:
 nlp = spacy.load(args.model)
-meta = {
-    'generator': {
-        'name': 'nopaque spacy NLP',
-        'version': '0.1.0',
-        'arguments': {
-            'check_encoding': args.check_encoding,
-            'model': args.model
-        }
-    },
-    'file': {
-        'encoding': encoding,
-        'md5': text_md5.hexdigest(),
-        'name': os.path.basename(args.input_file)
-    }
-}
+model_caps = []
+# token
+if True:
+    model_caps.append('token')
+if 'token' in model_caps:
+    # token.lemma
+    if nlp.has_pipe('lemmatizer') or nlp.has_pipe('trainable_lemmatizer'):
+        model_caps.append('token.lemma')
+    # token.simple_pos
+    if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
+        model_caps.append('token.simple_pos')
+    # token.pos
+    if nlp.has_pipe('tagger'):
+        model_caps.append('token.pos')
+    # TODO: Create a check for token.sentiment
+    # if <check>:
+    #     model_caps.append('token.sentiment')
+if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
+    model_caps.append('ent')
+if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'):
+    model_caps.append('s')
+if 's' in model_caps:
+    # TODO: Create a check for s.sentiment
+    # if <check>:
+    #     model_caps.append('s.sentiment')
+    pass
+
+# Document available tags and their properties
 tags = []
+
+# region token
 token = {
     'id': generate_id('token'),
     'name': 'token',
     'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.', # noqa
     'properties': []
 }
-# TODO: Check if all languages support token.sentiment
-token['properties'].append(
-    {
-        'id': generate_id('token.sentiment'),
-        'name': 'sentiment',
-        'description': 'A scalar value indicating the positivity or negativity of the token.' # noqa
-    }
-)
-if nlp.has_pipe('lemmatizer') or nlp.has_pipe('trainable_lemmatizer'):
+tags.append(token)
+# endregion token
+
+# region token.lemma
+if 'token.lemma' in model_caps:
     token['properties'].append(
         {
             'id': generate_id('token.lemma'),
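The new model_caps list records which annotation layers the loaded model can actually produce, derived from the pipeline components it ships with. Component presence can be checked with spaCy's nlp.has_pipe() or inspected via nlp.pipe_names; a small sketch, assuming some installed pipeline package (the model name below is illustrative):

import spacy

nlp = spacy.load('en_core_web_sm')  # any installed pipeline package works here

# The installed components determine the capabilities: without a 'tagger'
# there are no fine-grained part-of-speech tags, without 'ner' or an
# 'entity_ruler' there are no named entities, and so on.
print(nlp.pipe_names)            # e.g. ['tok2vec', 'tagger', 'parser', 'ner', ...]
print(nlp.has_pipe('tagger'))
print(nlp.has_pipe('ner'))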
@@ -114,7 +126,10 @@ if nlp.has_pipe('lemmatizer') or nlp.has_pipe('trainable_lemmatizer'):
             'description': 'The base form of the word'
         }
     )
-if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
+# endregion token.lemma
+
+# region token.simple_pos
+if 'token.simple_pos' in model_caps:
     token['properties'].append(
         {
             'id': generate_id('token.simple_pos'),
@@ -209,7 +224,10 @@ if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
             ]
         }
     )
-if nlp.has_pipe('tagger'):
+# endregion token.simple_pos
+
+# region token.pos
+if 'token.pos' in model_caps:
     token['properties'].append(
         {
             'id': generate_id('token.pos'),
@@ -224,7 +242,116 @@ if nlp.has_pipe('tagger'):
             ]
         }
     )
-if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
+elif 'simple_pos' in model_caps:
+    token['properties'].append(
+        {
+            'id': generate_id('token.pos'),
+            'name': 'pos',
+            'description': 'The detailed part-of-speech tag',
+            'labels': [
+                {
+                    'id': generate_id('token.pos=ADJ'),
+                    'name': 'ADJ',
+                    'description': 'adjective'
+                },
+                {
+                    'id': generate_id('token.pos=ADP'),
+                    'name': 'ADP',
+                    'description': 'adposition'
+                },
+                {
+                    'id': generate_id('token.pos=ADV'),
+                    'name': 'ADV',
+                    'description': 'adverb'
+                },
+                {
+                    'id': generate_id('token.pos=AUX'),
+                    'name': 'AUX',
+                    'description': 'auxiliary verb'
+                },
+                {
+                    'id': generate_id('token.pos=CONJ'),
+                    'name': 'CONJ',
+                    'description': 'coordinating conjunction'
+                },
+                {
+                    'id': generate_id('token.pos=DET'),
+                    'name': 'DET',
+                    'description': 'determiner'
+                },
+                {
+                    'id': generate_id('token.pos=INTJ'),
+                    'name': 'INTJ',
+                    'description': 'interjection'
+                },
+                {
+                    'id': generate_id('token.pos=NOUN'),
+                    'name': 'NOUN',
+                    'description': 'noun'
+                },
+                {
+                    'id': generate_id('token.pos=NUM'),
+                    'name': 'NUM',
+                    'description': 'numeral'
+                },
+                {
+                    'id': generate_id('token.pos=PART'),
+                    'name': 'PART',
+                    'description': 'particle'
+                },
+                {
+                    'id': generate_id('token.pos=PRON'),
+                    'name': 'PRON',
+                    'description': 'pronoun'
+                },
+                {
+                    'id': generate_id('token.pos=PROPN'),
+                    'name': 'PROPN',
+                    'description': 'proper noun'
+                },
+                {
+                    'id': generate_id('token.pos=PUNCT'),
+                    'name': 'PUNCT',
+                    'description': 'punctuation'
+                },
+                {
+                    'id': generate_id('token.pos=SCONJ'),
+                    'name': 'SCONJ',
+                    'description': 'subordinating conjunction'
+                },
+                {
+                    'id': generate_id('token.pos=SYM'),
+                    'name': 'SYM',
+                    'description': 'symbol'
+                },
+                {
+                    'id': generate_id('token.pos=VERB'),
+                    'name': 'VERB',
+                    'description': 'verb'
+                },
+                {
+                    'id': generate_id('token.pos=X'),
+                    'name': 'X',
+                    'description': 'other'
+                }
+            ]
+        }
+    )
+# endregion token.pos
+
+# region token.sentiment
+# if 'token.sentiment' in model_caps:
+#     token['properties'].append(
+#         {
+#             'id': generate_id('token.sentiment'),
+#             'name': 'sentiment',
+#             'description': 'A scalar value indicating the positivity or negativity of the token.' # noqa
+#         }
+#     )
+# endregion token.sentiment
+
+# region ent
+if 'ent' in model_caps:
     tags.append(
         {
             'id': generate_id('ent'),
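The two part-of-speech properties come from different spaCy token attributes: token.simple_pos is filled from the coarse Universal POS tag (Token.pos_), while token.pos is filled from the model-specific fine-grained tag (Token.tag_), as the helper functions further down show. A quick illustration, again assuming an installed example pipeline:

import spacy

nlp = spacy.load('en_core_web_sm')  # illustrative model choice
doc = nlp('Dogs bark loudly.')
for token in doc:
    # pos_ is the coarse Universal POS tag, tag_ the fine-grained tag,
    # e.g. ('Dogs', 'NOUN', 'NNS') for the English models.
    print(token.text, token.pos_, token.tag_)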
@@ -246,23 +373,100 @@ if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
             ]
         }
     )
-if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'): # noqa
+# endregion ent
+
+# region s
+if 's' in model_caps:
+    s = {
+        'id': generate_id('s'),
+        'name': 's',
+        'description': 'Encodes the start and end of a sentence',
+        'properties': []
+    }
+    tags.append(s)
     # TODO: Check if all languages support sent.sentiment
-    tags.append(
-        {
-            'id': generate_id('s'),
-            'name': 's',
-            'description': 'Encodes the start and end of a sentence',
-            'properties': [
-                {
-                    'id': generate_id('s.sentiment'),
-                    'name': 'sentiment',
-                    'description': 'A scalar value indicating the positivity or negativity of the sentence.' # noqa
-                }
-            ]
-        }
-    )
-tags.append(token)
+# endregion s
+
+# region s.sentiment
+# if 's.sentiment' in model_caps:
+#     s['properties'].append(
+#         {
+#             'id': generate_id('s.sentiment'),
+#             'name': 'sentiment',
+#             'description': 'A scalar value indicating the positivity or negativity of the sentence.' # noqa
+#         }
+#     )
+# endregion s.sentiment
+
+
+def create_ent_annotation(ent, chunk_offset=0):
+    return {
+        'start': ent.start_char + chunk_offset,
+        'end': ent.end_char + chunk_offset,
+        'tag_id': generate_id('ent'),
+        'properties': [
+            {
+                'property_id': generate_id('ent.type'),
+                'value': ent.label_
+            }
+        ]
+    }
+
+
+def create_sent_annotation(sent, chunk_offset=0):
+    annotation = {
+        'start': sent.start_char + chunk_offset,
+        'end': sent.end_char + chunk_offset,
+        'tag_id': generate_id('s'),
+        'properties': []
+    }
+    if hasattr(sent, 'sentiment') and 's.sentiment' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('s.sentiment'),
+                'value': sent.sentiment
+            }
+        )
+    return annotation
+
+
+def create_token_annotation(token, chunk_offset=0):
+    annotation = {
+        'start': token.idx + chunk_offset,
+        'end': token.idx + len(token.text) + chunk_offset,
+        'tag_id': generate_id('token'),
+        'properties': []
+    }
+    if hasattr(token, 'lemma_') and 'token.lemma' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('token.lemma'),
+                'value': token.lemma_
+            }
+        )
+    if hasattr(token, 'pos_') and 'token.simple_pos' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('token.simple_pos'),
+                'value': token.pos_
+            }
+        )
+    if hasattr(token, 'sentiment') and 'token.sentiment' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('token.sentiment'),
+                'value': token.sentiment
+            }
+        )
+    if hasattr(token, 'tag_') and 'token.pos' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('token.pos'),
+                'value': token.tag_
+            }
+        )
+    return annotation
+
+
 annotations = []
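Each of these helpers returns one annotation record: character offsets shifted by the current chunk offset, the id of the tag, and a list of property values. Assuming generate_id simply derives an id from the tag or property name (its definition is earlier in the script and not part of this diff), a token annotation for the word 'barks' starting at character 10 would look roughly like this:

# Illustrative shape only; the actual ids depend on the script's generate_id.
{
    'start': 10,
    'end': 15,
    'tag_id': '<id for token>',
    'properties': [
        {'property_id': '<id for token.lemma>', 'value': 'bark'},
        {'property_id': '<id for token.simple_pos>', 'value': 'VERB'},
        {'property_id': '<id for token.pos>', 'value': 'VBZ'}
    ]
}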
@@ -270,75 +474,36 @@ chunk_offset = 0
 while text_chunks:
     text_chunk = text_chunks.pop(0)
     doc = nlp(text_chunk)
-    if hasattr(doc, 'ents'):
+    if hasattr(doc, 'ents') and 'ent' in model_caps:
         for ent in doc.ents:
-            annotation = {
-                'start': ent.start_char + chunk_offset,
-                'end': ent.end_char + chunk_offset,
-                'tag_id': generate_id('ent'),
-                'properties': [
-                    {
-                        'property_id': generate_id('ent.type'),
-                        'value': ent.label_
-                    }
-                ]
-            }
-            annotations.append(annotation)
-    if hasattr(doc, 'sents'):
+            annotations.append(create_ent_annotation(ent, chunk_offset))
+    if hasattr(doc, 'sents') and 's' in model_caps:
         for sent in doc.sents:
-            annotation = {
-                'start': sent.start_char + chunk_offset,
-                'end': sent.end_char + chunk_offset,
-                'tag_id': generate_id('s'),
-                'properties': []
-            }
-            if hasattr(sent, 'sentiment'):
-                annotation['properties'].append(
-                    {
-                        'property_id': generate_id('s.sentiment'),
-                        'value': sent.sentiment
-                    }
-                )
-            annotations.append(annotation)
-    for token in doc:
-        annotation = {
-            'start': token.idx + chunk_offset,
-            'end': token.idx + len(token.text) + chunk_offset,
-            'tag_id': generate_id('token'),
-            'properties': []
-        }
-        if hasattr(token, 'lemma_'):
-            annotation['properties'].append(
-                {
-                    'property_id': generate_id('token.lemma'),
-                    'value': token.lemma_
-                }
-            )
-        if hasattr(token, 'pos_'):
-            annotation['properties'].append(
-                {
-                    'property_id': generate_id('token.simple_pos'),
-                    'value': token.pos_
-                }
-            )
-        if hasattr(token, 'sentiment'):
-            annotation['properties'].append(
-                {
-                    'property_id': generate_id('token.sentiment'),
-                    'value': token.sentiment
-                }
-            )
-        if hasattr(token, 'tag_'):
-            annotation['properties'].append(
-                {
-                    'property_id': generate_id('token.pos'),
-                    'value': token.tag_
-                }
-            )
-        annotations.append(annotation)
+            annotations.append(create_sent_annotation(sent, chunk_offset))
+    if 'token' in model_caps:
+        for token in doc:
+            annotations.append(create_token_annotation(token, chunk_offset))
     chunk_offset += len(text_chunk)
     text_chunk = None
+
+meta = {
+    'generator': {
+        'name': 'nopaque spacy NLP',
+        'version': '0.1.1',
+        'arguments': {
+            'check_encoding': args.check_encoding,
+            'model': args.model
+        }
+    },
+    'file': {
+        'encoding': encoding,
+        'md5': text_md5.hexdigest(),
+        'name': os.path.basename(args.input_file)
+    }
+}
+
 with open(args.output_file, 'w') as output_file:
     json.dump(
         {'meta': meta, 'tags': tags, 'annotations': annotations},
...
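The written output then bundles the three top-level blocks built above. A rough sketch of the resulting JSON (values abbreviated; the remaining json.dump arguments are outside the visible part of the diff):

# Approximate structure of the file written to args.output_file.
{
    'meta': {
        'generator': {
            'name': 'nopaque spacy NLP',
            'version': '0.1.1',
            'arguments': {'check_encoding': True, 'model': '<model name>'}
        },
        'file': {'encoding': 'utf-8', 'md5': '<hex digest>', 'name': '<input file name>'}
    },
    'tags': [
        {'id': '<tag id>', 'name': 'token', 'description': '...', 'properties': []}
        # plus 'ent' and 's' entries when the model supports them
    ],
    'annotations': [
        {'start': 0, 'end': 4, 'tag_id': '<tag id>', 'properties': []}
    ]
}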