Commit 5bd82d2c authored by Patrick Jentsch

Fix issues with some models, bump version

parent 31a780bb
@@ -52,14 +52,16 @@ with open(args.input_file, 'rb') as input_file:
         encoding = chardet.detect(input_file.read())['encoding']
     else:
         encoding = 'utf-8'
+    # After reading the input file with chardet, the file pointer needs to be reset
     input_file.seek(0)
     text_md5 = hashlib.md5()
     for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
         text_md5.update(chunk)
 # Load the text contents from the input file
 with open(args.input_file, encoding=encoding) as input_file:
-    # spaCy NLP is limited to strings with a maximum of 1 million characters at
+    # spaCy is limited to strings with a maximum of 1 million characters at
     # once. So we split it into suitable chunks.
     text_chunks = textwrap.wrap(
         input_file.read(),
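For context: spaCy refuses to process texts longer than nlp.max_length (1,000,000 characters by default), which is why the input is split into smaller chunks before processing. A minimal sketch of offset-aware chunking, with an illustrative helper name and chunk size rather than the parameters the script actually passes to textwrap.wrap:

def iter_chunks(text, max_len=1_000_000):
    # Yield (offset, chunk) pairs; each chunk stays below max_len characters.
    for offset in range(0, len(text), max_len):
        yield offset, text[offset:offset + max_len]

# Annotations produced for a chunk must add the chunk's offset to their
# start/end positions so they refer back to the original, unsplit text.
for offset, chunk in iter_chunks('a rather long example text', max_len=10):
    print(offset, repr(chunk))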
@@ -75,38 +77,48 @@ with open(args.input_file, encoding=encoding) as input_file:
 nlp = spacy.load(args.model)
-meta = {
-    'generator': {
-        'name': 'nopaque spacy NLP',
-        'version': '0.1.0',
-        'arguments': {
-            'check_encoding': args.check_encoding,
-            'model': args.model
-        }
-    },
-    'file': {
-        'encoding': encoding,
-        'md5': text_md5.hexdigest(),
-        'name': os.path.basename(args.input_file)
-    }
-}
+model_caps = []
+# token
+if True:
+    model_caps.append('token')
+if 'token' in model_caps:
+    # token.lemma
+    if nlp.has_pipe('lemmatizer') or nlp.has_pipe('trainable_lemmatizer'):
+        model_caps.append('token.lemma')
+    # token.simple_pos
+    if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
+        model_caps.append('token.simple_pos')
+    # token.pos
+    if nlp.has_pipe('tagger'):
+        model_caps.append('token.pos')
+    # TODO: Create a check for token.sentiment
+    # if <check>:
+    #     model_caps.append('token.sentiment')
+if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
+    model_caps.append('ent')
+if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'):
+    model_caps.append('s')
+if 's' in model_caps:
+    # TODO: Create a check for s.sentiment
+    # if <check>:
+    #     model_caps.append('s.sentiment')
+    pass
+
+# Document available tags and their properties
 tags = []
+
+# region token
 token = {
     'id': generate_id('token'),
     'name': 'token',
     'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.', # noqa
     'properties': []
 }
-# TODO: Check if all languages support token.sentiment
-token['properties'].append(
-    {
-        'id': generate_id('token.sentiment'),
-        'name': 'sentiment',
-        'description': 'A scalar value indicating the positivity or negativity of the token.' # noqa
-    }
-)
-if nlp.has_pipe('lemmatizer') or nlp.has_pipe('trainable_lemmatizer'):
+tags.append(token)
+# endregion token
+
+# region token.lemma
+if 'token.lemma' in model_caps:
     token['properties'].append(
         {
             'id': generate_id('token.lemma'),
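The new model_caps list records which annotation layers the loaded model can actually produce, derived from the pipeline components it ships with. Component presence can be checked with spaCy's nlp.has_pipe() or inspected via nlp.pipe_names; a small sketch, assuming some installed pipeline package (the model name below is illustrative):

import spacy

nlp = spacy.load('en_core_web_sm')  # any installed pipeline package works here

# The installed components determine the capabilities: without a 'tagger'
# there are no fine-grained part-of-speech tags, without 'ner' or an
# 'entity_ruler' there are no named entities, and so on.
print(nlp.pipe_names)            # e.g. ['tok2vec', 'tagger', 'parser', 'ner', ...]
print(nlp.has_pipe('tagger'))
print(nlp.has_pipe('ner'))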
@@ -114,7 +126,10 @@ if nlp.has_pipe('lemmatizer') or nlp.has_pipe('trainable_lemmatizer'):
             'description': 'The base form of the word'
         }
     )
-if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
+# endregion token.lemma
+
+# region token.simple_pos
+if 'token.simple_pos' in model_caps:
     token['properties'].append(
         {
             'id': generate_id('token.simple_pos'),
@@ -209,7 +224,10 @@ if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
             ]
         }
     )
-if nlp.has_pipe('tagger'):
+# endregion token.simple_pos
+
+# region token.pos
+if 'token.pos' in model_caps:
     token['properties'].append(
         {
             'id': generate_id('token.pos'),
@@ -224,7 +242,116 @@ if nlp.has_pipe('tagger'):
             ]
         }
     )
-if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
+elif 'simple_pos' in model_caps:
+    token['properties'].append(
+        {
+            'id': generate_id('token.pos'),
+            'name': 'pos',
+            'description': 'The detailed part-of-speech tag',
+            'labels': [
+                {
+                    'id': generate_id('token.pos=ADJ'),
+                    'name': 'ADJ',
+                    'description': 'adjective'
+                },
+                {
+                    'id': generate_id('token.pos=ADP'),
+                    'name': 'ADP',
+                    'description': 'adposition'
+                },
+                {
+                    'id': generate_id('token.pos=ADV'),
+                    'name': 'ADV',
+                    'description': 'adverb'
+                },
+                {
+                    'id': generate_id('token.pos=AUX'),
+                    'name': 'AUX',
+                    'description': 'auxiliary verb'
+                },
+                {
+                    'id': generate_id('token.pos=CONJ'),
+                    'name': 'CONJ',
+                    'description': 'coordinating conjunction'
+                },
+                {
+                    'id': generate_id('token.pos=DET'),
+                    'name': 'DET',
+                    'description': 'determiner'
+                },
+                {
+                    'id': generate_id('token.pos=INTJ'),
+                    'name': 'INTJ',
+                    'description': 'interjection'
+                },
+                {
+                    'id': generate_id('token.pos=NOUN'),
+                    'name': 'NOUN',
+                    'description': 'noun'
+                },
+                {
+                    'id': generate_id('token.pos=NUM'),
+                    'name': 'NUM',
+                    'description': 'numeral'
+                },
+                {
+                    'id': generate_id('token.pos=PART'),
+                    'name': 'PART',
+                    'description': 'particle'
+                },
+                {
+                    'id': generate_id('token.pos=PRON'),
+                    'name': 'PRON',
+                    'description': 'pronoun'
+                },
+                {
+                    'id': generate_id('token.pos=PROPN'),
+                    'name': 'PROPN',
+                    'description': 'proper noun'
+                },
+                {
+                    'id': generate_id('token.pos=PUNCT'),
+                    'name': 'PUNCT',
+                    'description': 'punctuation'
+                },
+                {
+                    'id': generate_id('token.pos=SCONJ'),
+                    'name': 'SCONJ',
+                    'description': 'subordinating conjunction'
+                },
+                {
+                    'id': generate_id('token.pos=SYM'),
+                    'name': 'SYM',
+                    'description': 'symbol'
+                },
+                {
+                    'id': generate_id('token.pos=VERB'),
+                    'name': 'VERB',
+                    'description': 'verb'
+                },
+                {
+                    'id': generate_id('token.pos=X'),
+                    'name': 'X',
+                    'description': 'other'
+                }
+            ]
+        }
+    )
+# endregion token.pos
+
+# region token.sentiment
+# if 'token.sentiment' in model_caps:
+#     token['properties'].append(
+#         {
+#             'id': generate_id('token.sentiment'),
+#             'name': 'sentiment',
+#             'description': 'A scalar value indicating the positivity or negativity of the token.' # noqa
+#         }
+#     )
+# endregion token.sentiment
+
+# region ent
+if 'ent' in model_caps:
     tags.append(
         {
             'id': generate_id('ent'),
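The two part-of-speech properties come from different spaCy token attributes: token.simple_pos is filled from the coarse Universal POS tag (Token.pos_), while token.pos is filled from the model-specific fine-grained tag (Token.tag_), as the helper functions further down show. A quick illustration, again assuming an installed example pipeline:

import spacy

nlp = spacy.load('en_core_web_sm')  # illustrative model choice
doc = nlp('Dogs bark loudly.')
for token in doc:
    # pos_ is the coarse Universal POS tag, tag_ the fine-grained tag,
    # e.g. ('Dogs', 'NOUN', 'NNS') for the English models.
    print(token.text, token.pos_, token.tag_)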
@@ -246,23 +373,100 @@ if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
             ]
         }
     )
-if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'): # noqa
+# endregion ent
+
+# region s
+if 's' in model_caps:
+    s = {
+        'id': generate_id('s'),
+        'name': 's',
+        'description': 'Encodes the start and end of a sentence',
+        'properties': []
+    }
+    tags.append(s)
     # TODO: Check if all languages support sent.sentiment
-    tags.append(
-        {
-            'id': generate_id('s'),
-            'name': 's',
-            'description': 'Encodes the start and end of a sentence',
-            'properties': [
-                {
-                    'id': generate_id('s.sentiment'),
-                    'name': 'sentiment',
-                    'description': 'A scalar value indicating the positivity or negativity of the sentence.' # noqa
-                }
-            ]
-        }
-    )
-tags.append(token)
+# endregion s
+
+# region s.sentiment
+# if 's.sentiment' in model_caps:
+#     s['properties'].append(
+#         {
+#             'id': generate_id('s.sentiment'),
+#             'name': 'sentiment',
+#             'description': 'A scalar value indicating the positivity or negativity of the sentence.' # noqa
+#         }
+#     )
+# endregion s.sentiment
+
+
+def create_ent_annotation(ent, chunk_offset=0):
+    return {
+        'start': ent.start_char + chunk_offset,
+        'end': ent.end_char + chunk_offset,
+        'tag_id': generate_id('ent'),
+        'properties': [
+            {
+                'property_id': generate_id('ent.type'),
+                'value': ent.label_
+            }
+        ]
+    }
+
+
+def create_sent_annotation(sent, chunk_offset=0):
+    annotation = {
+        'start': sent.start_char + chunk_offset,
+        'end': sent.end_char + chunk_offset,
+        'tag_id': generate_id('s'),
+        'properties': []
+    }
+    if hasattr(sent, 'sentiment') and 's.sentiment' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('s.sentiment'),
+                'value': sent.sentiment
+            }
+        )
+    return annotation
+
+
+def create_token_annotation(token, chunk_offset=0):
+    annotation = {
+        'start': token.idx + chunk_offset,
+        'end': token.idx + len(token.text) + chunk_offset,
+        'tag_id': generate_id('token'),
+        'properties': []
+    }
+    if hasattr(token, 'lemma_') and 'token.lemma' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('token.lemma'),
+                'value': token.lemma_
+            }
+        )
+    if hasattr(token, 'pos_') and 'token.simple_pos' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('token.simple_pos'),
+                'value': token.pos_
+            }
+        )
+    if hasattr(token, 'sentiment') and 'token.sentiment' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('token.sentiment'),
+                'value': token.sentiment
+            }
+        )
+    if hasattr(token, 'tag_') and 'token.pos' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('token.pos'),
+                'value': token.tag_
+            }
+        )
+    return annotation
+
+
 annotations = []
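Each of these helpers returns one annotation record: character offsets shifted by the current chunk offset, the id of the tag, and a list of property values. Assuming generate_id simply derives an id from the tag or property name (its definition is earlier in the script and not part of this diff), a token annotation for the word 'barks' starting at character 10 would look roughly like this:

# Illustrative shape only; the actual ids depend on the script's generate_id.
{
    'start': 10,
    'end': 15,
    'tag_id': '<id for token>',
    'properties': [
        {'property_id': '<id for token.lemma>', 'value': 'bark'},
        {'property_id': '<id for token.simple_pos>', 'value': 'VERB'},
        {'property_id': '<id for token.pos>', 'value': 'VBZ'}
    ]
}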
@@ -270,75 +474,36 @@ chunk_offset = 0
 while text_chunks:
     text_chunk = text_chunks.pop(0)
     doc = nlp(text_chunk)
-    if hasattr(doc, 'ents'):
+    if hasattr(doc, 'ents') and 'ent' in model_caps:
         for ent in doc.ents:
-            annotation = {
-                'start': ent.start_char + chunk_offset,
-                'end': ent.end_char + chunk_offset,
-                'tag_id': generate_id('ent'),
-                'properties': [
-                    {
-                        'property_id': generate_id('ent.type'),
-                        'value': ent.label_
-                    }
-                ]
-            }
-            annotations.append(annotation)
-    if hasattr(doc, 'sents'):
+            annotations.append(create_ent_annotation(ent, chunk_offset))
+    if hasattr(doc, 'sents') and 's' in model_caps:
         for sent in doc.sents:
-            annotation = {
-                'start': sent.start_char + chunk_offset,
-                'end': sent.end_char + chunk_offset,
-                'tag_id': generate_id('s'),
-                'properties': []
-            }
-            if hasattr(sent, 'sentiment'):
-                annotation['properties'].append(
-                    {
-                        'property_id': generate_id('s.sentiment'),
-                        'value': sent.sentiment
-                    }
-                )
-            annotations.append(annotation)
-    for token in doc:
-        annotation = {
-            'start': token.idx + chunk_offset,
-            'end': token.idx + len(token.text) + chunk_offset,
-            'tag_id': generate_id('token'),
-            'properties': []
-        }
-        if hasattr(token, 'lemma_'):
-            annotation['properties'].append(
-                {
-                    'property_id': generate_id('token.lemma'),
-                    'value': token.lemma_
-                }
-            )
-        if hasattr(token, 'pos_'):
-            annotation['properties'].append(
-                {
-                    'property_id': generate_id('token.simple_pos'),
-                    'value': token.pos_
-                }
-            )
-        if hasattr(token, 'sentiment'):
-            annotation['properties'].append(
-                {
-                    'property_id': generate_id('token.sentiment'),
-                    'value': token.sentiment
-                }
-            )
-        if hasattr(token, 'tag_'):
-            annotation['properties'].append(
-                {
-                    'property_id': generate_id('token.pos'),
-                    'value': token.tag_
-                }
-            )
-        annotations.append(annotation)
+            annotations.append(create_sent_annotation(sent, chunk_offset))
+    if 'token' in model_caps:
+        for token in doc:
+            annotations.append(create_token_annotation(token, chunk_offset))
     chunk_offset += len(text_chunk)
     text_chunk = None
+
+meta = {
+    'generator': {
+        'name': 'nopaque spacy NLP',
+        'version': '0.1.1',
+        'arguments': {
+            'check_encoding': args.check_encoding,
+            'model': args.model
+        }
+    },
+    'file': {
+        'encoding': encoding,
+        'md5': text_md5.hexdigest(),
+        'name': os.path.basename(args.input_file)
+    }
+}
+
 with open(args.output_file, 'w') as output_file:
     json.dump(
         {'meta': meta, 'tags': tags, 'annotations': annotations},
...
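The written output then bundles the three top-level blocks built above. A rough sketch of the resulting JSON (values abbreviated; the remaining json.dump arguments are outside the visible part of the diff):

# Approximate structure of the file written to args.output_file.
{
    'meta': {
        'generator': {
            'name': 'nopaque spacy NLP',
            'version': '0.1.1',
            'arguments': {'check_encoding': True, 'model': '<model name>'}
        },
        'file': {'encoding': 'utf-8', 'md5': '<hex digest>', 'name': '<input file name>'}
    },
    'tags': [
        {'id': '<tag id>', 'name': 'token', 'description': '...', 'properties': []}
        # plus 'ent' and 's' entries when the model supports them
    ],
    'annotations': [
        {'start': 0, 'end': 4, 'tag_id': '<tag id>', 'properties': []}
    ]
}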