Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
nopaque
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Monitor
Service Desk
Analyze
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
SFB 1288 - INF
nopaque
Commits
4146e378
Commit
4146e378
authored
2 years ago
by
Patrick Jentsch
Browse files
Options
Downloads
Patches
Plain Diff
normalize vrt on build
parent
99ddd2e3
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
app/converters/sandpaper.py
+15
-122
15 additions, 122 deletions
app/converters/sandpaper.py
app/converters/vrt.py
+117
-0
117 additions, 0 deletions
app/converters/vrt.py
app/models.py
+8
-1
8 additions, 1 deletion
app/models.py
with
140 additions
and
123 deletions
app/converters/sandpaper.py
+
15
−
122
View file @
4146e378
...
...
@@ -4,17 +4,18 @@ from app.models import User, Corpus, CorpusFile
from
datetime
import
datetime
import
json
import
os
import
shutil
def
convert
(
json_db_file
,
data_dir
):
with
open
(
json_db_file
,
'
r
'
)
as
f
:
json_db
=
json
.
loads
(
f
.
read
())
for
json_user
in
json_db
:
if
not
json_user
[
'
confirmed
'
]:
current_app
.
logger
.
info
(
f
'
Skip unconfirmed user
{
json_user
[
"
username
"
]
}
'
)
continue
user_dir
=
os
.
path
.
join
(
data_dir
,
json_user
[
'
id
'
])
user_dir
=
os
.
path
.
join
(
data_dir
,
str
(
json_user
[
'
id
'
])
)
convert_user
(
json_user
,
user_dir
)
db
.
session
.
commit
()
...
...
@@ -42,7 +43,7 @@ def convert_user(json_user, user_dir):
if
not
json_corpus
[
'
files
'
].
values
():
current_app
.
logger
.
info
(
f
'
Skip empty corpus
{
json_corpus
[
"
title
"
]
}
'
)
continue
corpus_dir
=
os
.
path
.
join
(
user_dir
,
'
corpora
'
,
json_corpus
[
'
id
'
])
corpus_dir
=
os
.
path
.
join
(
user_dir
,
'
corpora
'
,
str
(
json_corpus
[
'
id
'
])
)
convert_corpus
(
json_corpus
,
user
,
corpus_dir
)
current_app
.
logger
.
info
(
'
Done
'
)
...
...
@@ -66,12 +67,11 @@ def convert_corpus(json_corpus, user, corpus_dir):
db
.
session
.
rollback
()
raise
Exception
(
'
Internal Server Error
'
)
for
json_corpus_file
in
json_corpus
[
'
files
'
].
values
():
corpus_file_dir
=
os
.
path
.
join
(
corpus_dir
,
'
files
'
,
json_corpus_file
[
'
id
'
])
convert_corpus_file
(
json_corpus_file
,
corpus
,
corpus_file_dir
)
convert_corpus_file
(
json_corpus_file
,
corpus
,
corpus_dir
)
current_app
.
logger
.
info
(
'
Done
'
)
def
convert_corpus_file
(
json_corpus_file
,
corpus
,
corpus_
file_
dir
):
def
convert_corpus_file
(
json_corpus_file
,
corpus
,
corpus_dir
):
current_app
.
logger
.
info
(
f
'
Create CorpusFile
{
json_corpus_file
[
"
title
"
]
}
...
'
)
corpus_file
=
CorpusFile
(
corpus
=
corpus
,
...
...
@@ -94,122 +94,15 @@ def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir):
db
.
session
.
flush
(
objects
=
[
corpus_file
])
db
.
session
.
refresh
(
corpus_file
)
try
:
convert_vrt
(
os
.
path
.
join
(
corpus_
file_
dir
,
json_corpus_file
[
'
filename
'
]),
shutil
.
copy2
(
os
.
path
.
join
(
corpus_dir
,
json_corpus_file
[
'
filename
'
]),
corpus_file
.
path
)
except
OSError
as
e
:
current_app
.
logger
.
error
(
e
)
db
.
session
.
rollback
()
raise
Exception
(
'
Internal Server Error
'
)
except
:
current_app
.
logger
.
warning
(
'
Can not convert corpus file:
'
f
'
{
os
.
path
.
join
(
corpus_dir
,
json_corpus_file
[
"
filename
"
])
}
'
'
->
'
f
'
{
corpus_file
.
path
}
'
)
current_app
.
logger
.
info
(
'
Done
'
)
def convert_vrt(input_file, output_file):
    """Rewrite a VRT file into the ``word, pos, lemma, simple_pos`` layout.

    The positional attribute order of the input is detected from its first
    recognisable token line. Token lines are re-emitted with exactly four
    tab separated columns. If the input has no ``<ent`` structural
    attribute, the fifth (ner) column is converted into ``<ent type="...">``
    / ``</ent>`` tags wrapped around the tokens.

    Tag handling: ``<corpus``, ``</corpus`` and ``<nlp`` lines are dropped,
    ``<text ...>`` and ``<s ...>`` opening tags lose their attributes, every
    other tag line is copied unchanged. Blank lines are dropped.

    :param input_file: path of the VRT file to read
    :param output_file: path the converted VRT is written to
    :raises Exception: ``'Can not handle format'`` if no known positional
        attribute order could be detected
    """
    simple_pos_labels = frozenset((
        'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
        'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'
    ))
    legacy_order = ['word', 'lemma', 'simple_pos', 'pos', 'ner']
    known_orders = (
        legacy_order,
        ['word', 'pos', 'lemma', 'simple_pos', 'ner'],
        ['word', 'pos', 'lemma', 'simple_pos']
    )

    def check_pos_attribute_order(vrt_lines):
        # The following orders are possible:
        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
        # since 27.01.2022: 'word,pos,lemma,simple_pos'
        # The order is guessed from the number of attributes and the column
        # in which a simple_pos label shows up.
        for line in vrt_lines:
            if line.startswith('<'):
                continue
            pos_attrs = line.rstrip('\n').split('\t')
            num_pos_attrs = len(pos_attrs)
            if num_pos_attrs == 4:
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos']
            elif num_pos_attrs == 5:
                if pos_attrs[2] in simple_pos_labels:
                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
        return None

    def pos_attrs_to_string_1(pos_attrs):
        # word, lemma, simple_pos, pos -> word, pos, lemma, simple_pos
        return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'

    def pos_attrs_to_string_2(pos_attrs):
        # Already in target order, only a possible ner column is cut off.
        return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
    # Fail with the intended error before the order is formatted for output;
    # joining ``None`` would raise an unrelated TypeError instead.
    if pos_attr_order not in known_orders:
        raise Exception('Can not handle format')
    has_ent_as_s_attr = any(
        line.startswith('<ent') for line in input_vrt_lines
    )

    print(f'Detected pos_attr_order: [{", ".join(pos_attr_order)}]')
    print(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}')

    if pos_attr_order == legacy_order:
        pos_attrs_to_string_function = pos_attrs_to_string_1
    else:
        pos_attrs_to_string_function = pos_attrs_to_string_2

    current_ent = None
    # Collected in a list and joined once to avoid quadratic concatenation.
    output_parts = []
    for line in input_vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # Any tag ends an entity that was opened from the ner column.
            if not has_ent_as_s_attr and current_ent is not None:
                output_parts.append('</ent>\n')
                current_ent = None
            if line.startswith(('<corpus', '</corpus', '<nlp')):
                continue
            if line.startswith('<text'):
                output_parts.append('<text>\n')
            elif line.startswith('<s'):
                output_parts.append('<s>\n')
            else:
                output_parts.append(line)
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if not has_ent_as_s_attr:
            # Four column input carries no ner column at all; treat such
            # tokens as lying outside of any entity instead of failing with
            # an IndexError.
            ner = pos_attrs[4] if len(pos_attrs) > 4 else 'NULL'
            if ner.lower() in ('null', 'none'):
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                    current_ent = None
            else:
                if current_ent is not None and current_ent != ner:
                    output_parts.append('</ent>\n')
                    current_ent = None
                if current_ent is None:
                    output_parts.append(f'<ent type="{ner}">\n')
                    current_ent = ner
        output_parts.append(pos_attrs_to_string_function(pos_attrs))

    with open(output_file, 'w') as f:
        f.write(''.join(output_parts))
This diff is collapsed.
Click to expand it.
app/converters/vrt.py
0 → 100644
+
117
−
0
View file @
4146e378
from
flask
import
current_app
def normalize_vrt_file(input_file, output_file):
    """Rewrite a VRT file into the ``word, pos, lemma, simple_pos`` layout.

    The positional attribute order of the input is detected from its first
    recognisable token line. Token lines are re-emitted with exactly four
    tab separated columns. If the input has no ``<ent`` structural
    attribute, the fifth (ner) column is converted into ``<ent type="...">``
    / ``</ent>`` tags wrapped around the tokens.

    Tag handling: ``<text ...>`` and ``<s ...>`` opening tags lose their
    attributes, ``</text>``, ``</s>``, ``<ent ...>`` and ``</ent>`` are kept,
    every other tag line is dropped. Tag definitions spanning several lines
    are skipped up to and including the line that ends with ``>``. Blank
    lines are dropped.

    :param input_file: path of the VRT file to read
    :param output_file: path the normalized VRT is written to
    :raises Exception: ``'Can not handle format'`` if no known positional
        attribute order could be detected
    """
    simple_pos_labels = frozenset((
        'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
        'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'
    ))
    legacy_order = ['word', 'lemma', 'simple_pos', 'pos', 'ner']
    known_orders = (
        legacy_order,
        ['word', 'pos', 'lemma', 'simple_pos', 'ner'],
        ['word', 'pos', 'lemma', 'simple_pos']
    )

    def check_pos_attribute_order(vrt_lines):
        # The following orders are possible:
        # since 26.02.2019: 'word,lemma,simple_pos,pos,ner'
        # since 26.03.2021: 'word,pos,lemma,simple_pos,ner'
        # since 27.01.2022: 'word,pos,lemma,simple_pos'
        # The order is guessed from the number of attributes and the column
        # in which a simple_pos label shows up.
        for line in vrt_lines:
            if line.startswith('<'):
                continue
            pos_attrs = line.rstrip('\n').split('\t')
            num_pos_attrs = len(pos_attrs)
            if num_pos_attrs == 4:
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos']
            elif num_pos_attrs == 5:
                if pos_attrs[2] in simple_pos_labels:
                    return ['word', 'lemma', 'simple_pos', 'pos', 'ner']
                if pos_attrs[3] in simple_pos_labels:
                    return ['word', 'pos', 'lemma', 'simple_pos', 'ner']
        return None

    def pos_attrs_to_string_1(pos_attrs):
        # word, lemma, simple_pos, pos -> word, pos, lemma, simple_pos
        return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n'

    def pos_attrs_to_string_2(pos_attrs):
        # Already in target order, only a possible ner column is cut off.
        return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n'

    current_app.logger.info(f'Converting {input_file}...')

    with open(input_file) as f:
        input_vrt_lines = f.readlines()

    pos_attr_order = check_pos_attribute_order(input_vrt_lines)
    # Fail with the intended error before the order is formatted for the
    # log; joining ``None`` would raise an unrelated TypeError instead.
    if pos_attr_order not in known_orders:
        raise Exception('Can not handle format')
    has_ent_as_s_attr = any(
        line.startswith('<ent') for line in input_vrt_lines
    )

    current_app.logger.info(
        f'Detected pos_attr_order: [{", ".join(pos_attr_order)}]'
    )
    current_app.logger.info(
        f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}'
    )

    if pos_attr_order == legacy_order:
        pos_attrs_to_string_function = pos_attrs_to_string_1
    else:
        pos_attrs_to_string_function = pos_attrs_to_string_2

    current_ent = None
    multi_line_tag_definition = False
    # Collected in a list and joined once to avoid quadratic concatenation.
    output_parts = []
    for line in input_vrt_lines:
        if line.strip() == '':
            continue
        if line.startswith('<'):
            # Any tag ends an entity that was opened from the ner column.
            if not has_ent_as_s_attr and current_ent is not None:
                output_parts.append('</ent>\n')
                current_ent = None
            if not line.rstrip().endswith('>'):
                multi_line_tag_definition = True
            if line.startswith('<text'):
                output_parts.append('<text>\n')
            elif line.startswith('</text>'):
                output_parts.append('</text>\n')
            elif line.startswith('<s'):
                output_parts.append('<s>\n')
            elif line.startswith('</s>'):
                output_parts.append('</s>\n')
            elif line.startswith(('<ent', '</ent>')):
                output_parts.append(line)
            continue
        if multi_line_tag_definition:
            # Every line of an unfinished tag definition belongs to the tag,
            # not only the one closing it; none of them is a token line.
            if line.rstrip().endswith('>'):
                multi_line_tag_definition = False
            continue
        pos_attrs = line.rstrip('\n').split('\t')
        if not has_ent_as_s_attr:
            # Four column input carries no ner column at all; treat such
            # tokens as lying outside of any entity instead of failing with
            # an IndexError.
            ner = pos_attrs[4] if len(pos_attrs) > 4 else 'NULL'
            if ner.lower() in ('null', 'none'):
                if current_ent is not None:
                    output_parts.append('</ent>\n')
                    current_ent = None
            else:
                if current_ent is not None and current_ent != ner:
                    output_parts.append('</ent>\n')
                    current_ent = None
                if current_ent is None:
                    output_parts.append(f'<ent type="{ner}">\n')
                    current_ent = ner
        output_parts.append(pos_attrs_to_string_function(pos_attrs))

    with open(output_file, 'w') as f:
        f.write(''.join(output_parts))
This diff is collapsed.
Click to expand it.
app/models.py
+
8
−
1
View file @
4146e378
from
app.converters.vrt
import
normalize_vrt_file
from
datetime
import
datetime
,
timedelta
from
enum
import
IntEnum
from
flask
import
current_app
,
url_for
...
...
@@ -854,7 +855,13 @@ class Corpus(HashidMixin, db.Model):
def
build
(
self
):
corpus_element
=
ET
.
fromstring
(
'
<corpus>
\n
</corpus>
'
)
for
corpus_file
in
self
.
files
:
element_tree
=
ET
.
parse
(
corpus_file
.
path
)
normalized_vrt_path
=
os
.
path
.
join
(
self
.
path
,
'
cwb
'
,
f
'
{
corpus_file
.
id
}
.norm.vrt
'
)
try
:
normalize_vrt_file
(
corpus_file
.
path
,
normalized_vrt_path
)
except
:
self
.
status
=
CorpusStatus
.
FAILED
return
element_tree
=
ET
.
parse
(
normalized_vrt_path
)
text_element
=
element_tree
.
getroot
()
text_element
.
set
(
'
address
'
,
corpus_file
.
address
or
'
NULL
'
)
text_element
.
set
(
'
author
'
,
corpus_file
.
author
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment