spaCy NLP Pipeline (SFB 1288 - INF)

Commit 5bd82d2c, authored 2 years ago by Patrick Jentsch
Parent: 31a780bb

    Fix issues with some models, bump version

1 changed file: spacy-nlp (+273 additions, −108 deletions)
@@ -52,14 +52,16 @@ with open(args.input_file, 'rb') as input_file:
         encoding = chardet.detect(input_file.read())['encoding']
     else:
         encoding = 'utf-8'
+    # After reading the input file with chardet, the file pointer needs to be reset
    input_file.seek(0)
     text_md5 = hashlib.md5()
     for chunk in iter(lambda: input_file.read(128 * text_md5.block_size), b''):
         text_md5.update(chunk)
 
 # Load the text contents from the input file
 with open(args.input_file, encoding=encoding) as input_file:
-    # spaCy NLP is limited to strings with a maximum of 1 million characters at
+    # spaCy is limited to strings with a maximum of 1 million characters at
     # once. So we split it into suitable chunks.
     text_chunks = textwrap.wrap(
         input_file.read(),
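The added comment documents a real pitfall: chardet.detect(input_file.read()) consumes the whole stream, so without the seek(0) the MD5 loop that follows would hash zero bytes. A minimal sketch of the detect-then-rewind pattern in isolation, assuming a local file named example.txt:

import chardet
import hashlib

with open('example.txt', 'rb') as f:
    # detect() wants raw bytes; read() leaves the file pointer at EOF
    encoding = chardet.detect(f.read())['encoding']
    f.seek(0)  # rewind, or the hashing loop below reads nothing
    md5 = hashlib.md5()
    for chunk in iter(lambda: f.read(128 * md5.block_size), b''):
        md5.update(chunk)
print(encoding, md5.hexdigest())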
@@ -75,38 +77,48 @@ with open(args.input_file, encoding=encoding) as input_file:
 
 nlp = spacy.load(args.model)
 
-meta = {
-    'generator': {
-        'name': 'nopaque spacy NLP',
-        'version': '0.1.0',
-        'arguments': {
-            'check_encoding': args.check_encoding,
-            'model': args.model
-        }
-    },
-    'file': {
-        'encoding': encoding,
-        'md5': text_md5.hexdigest(),
-        'name': os.path.basename(args.input_file)
-    }
-}
+model_caps = []
+# token
+if True:
+    model_caps.append('token')
+if 'token' in model_caps:
+    # token.lemma
+    if nlp.has_pipe('lemmatizer') or nlp.has_pipe('trainable_lemmatizer'):
+        model_caps.append('token.lemma')
+    # token.simple_pos
+    if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
+        model_caps.append('token.simple_pos')
+    # token.pos
+    if nlp.has_pipe('tagger'):
+        model_caps.append('token.pos')
+    # TODO: Create a check for token.sentiment
+    # if <check>:
+    #     model_caps.append('token.sentiment')
+if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
+    model_caps.append('ent')
+if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'):
+    model_caps.append('s')
+if 's' in model_caps:
+    # TODO: Create a check for s.sentiment
+    # if <check>:
+    #     model_caps.append('s.sentiment')
+    pass
 
+# Document available tags and their properties
 tags = []
 
+# region token
 token = {
     'id': generate_id('token'),
     'name': 'token',
     'description': 'An individual token — i.e. a word, punctuation symbol, whitespace, etc.',  # noqa
     'properties': []
 }
-# TODO: Check if all languages support token.sentiment
-token['properties'].append(
-    {
-        'id': generate_id('token.sentiment'),
-        'name': 'sentiment',
-        'description': 'A scalar value indicating the positivity or negativity of the token.'  # noqa
-    }
-)
-if nlp.has_pipe('lemmatizer') or nlp.has_pipe('trainable_lemmatizer'):
+tags.append(token)
+# endregion token
+
+# region token.lemma
+if 'token.lemma' in model_caps:
     token['properties'].append(
         {
             'id': generate_id('token.lemma'),
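The new model_caps list probes the loaded pipeline with nlp.has_pipe() instead of assuming every model carries every component; checking 'trainable_lemmatizer' alongside 'lemmatizer' matters because spaCy's edit-tree lemmatizer registers under that name in some pipelines, which is presumably part of what "fix issues with some models" refers to. A standalone sketch of the probe, with the model name only as an example:

import spacy

nlp = spacy.load('en_core_web_sm')  # any installed pipeline package works
print(nlp.pipe_names)

caps = ['token']
if nlp.has_pipe('lemmatizer') or nlp.has_pipe('trainable_lemmatizer'):
    caps.append('token.lemma')
if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
    caps.append('token.simple_pos')
if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
    caps.append('ent')
if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'):
    caps.append('s')
print(caps)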
@@ -114,7 +126,10 @@ if nlp.has_pipe('lemmatizer') or nlp.has_pipe('trainable_lemmatizer'):
             'description': 'The base form of the word'
         }
     )
-if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
+# endregion token.lemma
+
+# region token.simple_pos
+if 'token.simple_pos' in model_caps:
     token['properties'].append(
         {
             'id': generate_id('token.simple_pos'),
@@ -209,7 +224,10 @@ if nlp.has_pipe('morphologizer') or nlp.has_pipe('tagger'):
             ]
         }
     )
-if nlp.has_pipe('tagger'):
+# endregion token.simple_pos
+
+# region token.pos
+if 'token.pos' in model_caps:
     token['properties'].append(
         {
             'id': generate_id('token.pos'),
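The 'token.simple_pos' / 'token.pos' split mirrors spaCy's two tag layers: Token.pos_ carries the coarse Universal POS tag, Token.tag_ the fine-grained, treebank-specific one. A quick illustration, model name again only an example:

import spacy

nlp = spacy.load('en_core_web_sm')
for token in nlp('She sells seashells.'):
    # pos_ -> coarse UPOS ('token.simple_pos' here),
    # tag_ -> fine-grained tag ('token.pos' here)
    print(token.text, token.pos_, token.tag_)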
@@ -224,7 +242,116 @@ if nlp.has_pipe('tagger'):
             ]
         }
     )
-if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
+elif 'simple_pos' in model_caps:
+    token['properties'].append(
+        {
+            'id': generate_id('token.pos'),
+            'name': 'pos',
+            'description': 'The detailed part-of-speech tag',
+            'labels': [
+                {
+                    'id': generate_id('token.pos=ADJ'),
+                    'name': 'ADJ',
+                    'description': 'adjective'
+                },
+                {
+                    'id': generate_id('token.pos=ADP'),
+                    'name': 'ADP',
+                    'description': 'adposition'
+                },
+                {
+                    'id': generate_id('token.pos=ADV'),
+                    'name': 'ADV',
+                    'description': 'adverb'
+                },
+                {
+                    'id': generate_id('token.pos=AUX'),
+                    'name': 'AUX',
+                    'description': 'auxiliary verb'
+                },
+                {
+                    'id': generate_id('token.pos=CONJ'),
+                    'name': 'CONJ',
+                    'description': 'coordinating conjunction'
+                },
+                {
+                    'id': generate_id('token.pos=DET'),
+                    'name': 'DET',
+                    'description': 'determiner'
+                },
+                {
+                    'id': generate_id('token.pos=INTJ'),
+                    'name': 'INTJ',
+                    'description': 'interjection'
+                },
+                {
+                    'id': generate_id('token.pos=NOUN'),
+                    'name': 'NOUN',
+                    'description': 'noun'
+                },
+                {
+                    'id': generate_id('token.pos=NUM'),
+                    'name': 'NUM',
+                    'description': 'numeral'
+                },
+                {
+                    'id': generate_id('token.pos=PART'),
+                    'name': 'PART',
+                    'description': 'particle'
+                },
+                {
+                    'id': generate_id('token.pos=PRON'),
+                    'name': 'PRON',
+                    'description': 'pronoun'
+                },
+                {
+                    'id': generate_id('token.pos=PROPN'),
+                    'name': 'PROPN',
+                    'description': 'proper noun'
+                },
+                {
+                    'id': generate_id('token.pos=PUNCT'),
+                    'name': 'PUNCT',
+                    'description': 'punctuation'
+                },
+                {
+                    'id': generate_id('token.pos=SCONJ'),
+                    'name': 'SCONJ',
+                    'description': 'subordinating conjunction'
+                },
+                {
+                    'id': generate_id('token.pos=SYM'),
+                    'name': 'SYM',
+                    'description': 'symbol'
+                },
+                {
+                    'id': generate_id('token.pos=VERB'),
+                    'name': 'VERB',
+                    'description': 'verb'
+                },
+                {
+                    'id': generate_id('token.pos=X'),
+                    'name': 'X',
+                    'description': 'other'
+                }
+            ]
+        }
+    )
+# endregion token.pos
+
+# region token.sentiment
+# if 'token.sentiment' in model_caps:
+#     token['properties'].append(
+#         {
+#             'id': generate_id('token.sentiment'),
+#             'name': 'sentiment',
+#             'description': 'A scalar value indicating the positivity or negativity of the token.'  # noqa
+#         }
+#     )
+# endregion token.sentiment
+
+# region ent
+if 'ent' in model_caps:
     tags.append(
         {
             'id': generate_id('ent'),
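The hard-coded label list spells out the Universal POS tag set, and the descriptions closely track spaCy's own glossary. A sketch of how such a list could be generated with spacy.explain(), not what the script does, just an equivalence check:

import spacy

UPOS = ['ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
        'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
# spacy.explain() maps a tag name to its glossary description, e.g. 'ADJ' -> 'adjective'
labels = [{'name': t, 'description': spacy.explain(t)} for t in UPOS]
print(labels[0])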
@@ -246,23 +373,100 @@ if nlp.has_pipe('ner') or nlp.has_pipe('entity_ruler'):
             ]
         }
     )
-if nlp.has_pipe('parser') or nlp.has_pipe('senter') or nlp.has_pipe('sentencizer'):  # noqa
-    # TODO: Check if all languages support sent.sentiment
-    tags.append(
-        {
-            'id': generate_id('s'),
-            'name': 's',
-            'description': 'Encodes the start and end of a sentence',
-            'properties': [
-                {
-                    'id': generate_id('s.sentiment'),
-                    'name': 'sentiment',
-                    'description': 'A scalar value indicating the positivity or negativity of the sentence.'  # noqa
-                }
-            ]
-        }
-    )
-tags.append(token)
+# endregion ent
+
+# region s
+if 's' in model_caps:
+    s = {
+        'id': generate_id('s'),
+        'name': 's',
+        'description': 'Encodes the start and end of a sentence',
+        'properties': []
+    }
+    tags.append(s)
+    # TODO: Check if all languages support sent.sentiment
+# endregion s
+
+# region s.sentiment
+# if 's.sentiment' in model_caps:
+#     s['properties'].append(
+#         {
+#             'id': generate_id('s.sentiment'),
+#             'name': 'sentiment',
+#             'description': 'A scalar value indicating the positivity or negativity of the sentence.'  # noqa
+#         }
+#     )
+# endregion s.sentiment
+
+
+def create_ent_annotation(ent, chunk_offset=0):
+    return {
+        'start': ent.start_char + chunk_offset,
+        'end': ent.end_char + chunk_offset,
+        'tag_id': generate_id('ent'),
+        'properties': [
+            {
+                'property_id': generate_id('ent.type'),
+                'value': ent.label_
+            }
+        ]
+    }
+
+
+def create_sent_annotation(sent, chunk_offset=0):
+    annotation = {
+        'start': sent.start_char + chunk_offset,
+        'end': sent.end_char + chunk_offset,
+        'tag_id': generate_id('s'),
+        'properties': []
+    }
+    if hasattr(sent, 'sentiment') and 's.sentiment' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('s.sentiment'),
+                'value': sent.sentiment
+            }
+        )
+    return annotation
+
+
+def create_token_annotation(token, chunk_offset=0):
+    annotation = {
+        'start': token.idx + chunk_offset,
+        'end': token.idx + len(token.text) + chunk_offset,
+        'tag_id': generate_id('token'),
+        'properties': []
+    }
+    if hasattr(token, 'lemma_') and 'token.lemma' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('token.lemma'),
+                'value': token.lemma_
+            }
+        )
+    if hasattr(token, 'pos_') and 'token.simple_pos' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('token.simple_pos'),
+                'value': token.pos_
+            }
+        )
+    if hasattr(token, 'sentiment') and 'token.sentiment' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('token.sentiment'),
+                'value': token.sentiment
+            }
+        )
+    if hasattr(token, 'tag_') and 'token.pos' in model_caps:
+        annotation['properties'].append(
+            {
+                'property_id': generate_id('token.pos'),
+                'value': token.tag_
+            }
+        )
+    return annotation
+
 
 annotations = []
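The three new helpers factor the annotation-building out of the main loop: each takes a spaCy span or token plus the running chunk_offset and returns one annotation dict with document-global character positions. Hypothetical usage on a small document, assuming nlp, generate_id, model_caps, and the helpers above are in scope:

doc = nlp('Berlin is a city. It is big.')
annotations = []
for ent in doc.ents:        # named-entity spans
    annotations.append(create_ent_annotation(ent))
for sent in doc.sents:      # sentence spans
    annotations.append(create_sent_annotation(sent))
for token in doc:           # token spans
    annotations.append(create_token_annotation(token))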
@@ -270,75 +474,36 @@ chunk_offset = 0
 while text_chunks:
     text_chunk = text_chunks.pop(0)
     doc = nlp(text_chunk)
-    if hasattr(doc, 'ents'):
+    if hasattr(doc, 'ents') and 'ent' in model_caps:
         for ent in doc.ents:
-            annotation = {
-                'start': ent.start_char + chunk_offset,
-                'end': ent.end_char + chunk_offset,
-                'tag_id': generate_id('ent'),
-                'properties': [
-                    {
-                        'property_id': generate_id('ent.type'),
-                        'value': ent.label_
-                    }
-                ]
-            }
-            annotations.append(annotation)
-    if hasattr(doc, 'sents'):
+            annotations.append(create_ent_annotation(ent, chunk_offset))
+    if hasattr(doc, 'sents') and 's' in model_caps:
         for sent in doc.sents:
-            annotation = {
-                'start': sent.start_char + chunk_offset,
-                'end': sent.end_char + chunk_offset,
-                'tag_id': generate_id('s'),
-                'properties': []
-            }
-            if hasattr(sent, 'sentiment'):
-                annotation['properties'].append(
-                    {
-                        'property_id': generate_id('s.sentiment'),
-                        'value': sent.sentiment
-                    }
-                )
-            annotations.append(annotation)
-    for token in doc:
-        annotation = {
-            'start': token.idx + chunk_offset,
-            'end': token.idx + len(token.text) + chunk_offset,
-            'tag_id': generate_id('token'),
-            'properties': []
-        }
-        if hasattr(token, 'lemma_'):
-            annotation['properties'].append(
-                {
-                    'property_id': generate_id('token.lemma'),
-                    'value': token.lemma_
-                }
-            )
-        if hasattr(token, 'pos_'):
-            annotation['properties'].append(
-                {
-                    'property_id': generate_id('token.simple_pos'),
-                    'value': token.pos_
-                }
-            )
-        if hasattr(token, 'sentiment'):
-            annotation['properties'].append(
-                {
-                    'property_id': generate_id('token.sentiment'),
-                    'value': token.sentiment
-                }
-            )
-        if hasattr(token, 'tag_'):
-            annotation['properties'].append(
-                {
-                    'property_id': generate_id('token.pos'),
-                    'value': token.tag_
-                }
-            )
-        annotations.append(annotation)
+            annotations.append(create_sent_annotation(sent, chunk_offset))
+    if 'token' in model_caps:
+        for token in doc:
+            annotations.append(create_token_annotation(token, chunk_offset))
     chunk_offset += len(text_chunk)
     text_chunk = None
 
+meta = {
+    'generator': {
+        'name': 'nopaque spacy NLP',
+        'version': '0.1.1',
+        'arguments': {
+            'check_encoding': args.check_encoding,
+            'model': args.model
+        }
+    },
+    'file': {
+        'encoding': encoding,
+        'md5': text_md5.hexdigest(),
+        'name': os.path.basename(args.input_file)
+    }
+}
+
 with open(args.output_file, 'w') as output_file:
     json.dump(
         {'meta': meta, 'tags': tags, 'annotations': annotations},
...
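The loop turns chunk-local offsets into document-global ones by advancing chunk_offset by the length of each processed chunk; this stays exact as long as the chunks concatenate back to the original text. A minimal, spaCy-free sketch of that bookkeeping, using the same 1,000,000-character budget as the script:

import textwrap

text = 'First sentence here. Second sentence here.'
chunks = textwrap.wrap(text, 1_000_000)
offset = 0
for chunk in chunks:
    # spans computed inside a chunk become document-global by adding offset
    print((offset, offset + len(chunk)), repr(chunk))
    offset += len(chunk)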