Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
nopaque
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Monitor
Service Desk
Analyze
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
SFB 1288 - INF
nopaque
Commits
b3d5c15d
Commit
b3d5c15d
authored
5 years ago
by
Stephan Porada
Browse files
Options
Downloads
Patches
Plain Diff
CQiWrapper new data structure
parent
30f60b60
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
app/corpora/CQiWrapper/CQiWrapper.py
+90
-108
90 additions, 108 deletions
app/corpora/CQiWrapper/CQiWrapper.py
with
90 additions
and
108 deletions
app/corpora/CQiWrapper/CQiWrapper.py
+
90
−
108
View file @
b3d5c15d
from
.CQiClient
import
CQiClient
from
.CQi
import
CONST_FIELD_MATCH
,
CONST_FIELD_MATCHEND
import
collections
from
CQiClient
import
CQiClient
from
CQi
import
CONST_FIELD_MATCH
,
CONST_FIELD_MATCHEND
import
re
from
app
import
logger
# only works if imported into opaque web app
#
from app import logger # only works if imported into opaque web app
class
CQiWrapper
(
CQiClient
):
...
...
@@ -55,16 +54,16 @@ class CQiWrapper(CQiClient):
self
.
attr_strings
[
'
struct_attrs
'
][
struct_attr
]
=
(
self
.
corpus_name
+
'
.
'
+
struct_attr
)
logger
.
warning
((
'
All positional and
'
'
structural attributes: {}
'
).
format
(
self
.
attr_strings
))
#
logger.warning(('All positional and '
#
'structural attributes: {}').format(self.attr_strings))
def
select_corpus
(
self
,
corpus_name
):
if
corpus_name
in
self
.
corpus_list_coprora
():
self
.
corpus_name
=
corpus_name
self
.
__create_attribute_strings
()
logger
.
warning
(
'
{} does exist.
'
.
format
(
corpus_name
))
#
logger.warning('{} does exist.'.format(corpus_name))
else
:
logger
.
warning
(
'
{} does not exist.
'
.
format
(
corpus_name
))
#
logger.warning('{} does not exist.'.format(corpus_name))
pass
def
disconnect
(
self
):
...
...
@@ -75,7 +74,7 @@ class CQiWrapper(CQiClient):
"""
self
.
ctrl_bye
()
self
.
connection
.
close
()
logger
.
warning
(
'
Disconnected from cqp server.
'
)
#
logger.warning('Disconnected from cqp server.')
def
query_subcorpus
(
self
,
query
,
result_subcorpus_name
=
'
Query-results
'
):
"""
...
...
@@ -95,7 +94,7 @@ class CQiWrapper(CQiClient):
+
result_subcorpus_name
)
self
.
SUBCORPUS_NAMES
.
append
(
self
.
result_subcorpus
)
self
.
nr_matches
=
self
.
cqp_subcorpus_size
(
self
.
result_subcorpus
)
logger
.
warning
(
'
Nr of all matches is: {}
'
.
format
(
self
.
nr_matches
))
#
logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
def
show_subcorpora
(
self
):
"""
...
...
@@ -125,7 +124,7 @@ class CQiWrapper(CQiClient):
)
self
.
nr_matches
=
min
(
result_len
,
self
.
nr_matches
)
if
self
.
nr_matches
==
0
:
logger
.
warning
(
'
Query resulted in 0 matches.
'
)
#
logger.warning('Query resulted in 0 matches.')
return
None
else
:
# Get match cpos boundaries
...
...
@@ -141,86 +140,49 @@ class CQiWrapper(CQiClient):
0
,
self
.
nr_matches
-
1
))
# Generate all cpos between boundaries including start and end boundaries
# Save them as a list into one match entry at serial number 'i'
ordered_matches
=
collections
.
OrderedDict
()
for
i
,
match_pair
in
enumerate
(
match_boundaries
):
ordered_matches
[
i
]
=
({
'
match_cpos
'
:
list
(
range
(
match_pair
[
0
],
match_pair
[
1
]
+
1
))})
# Saves cpos from all match entries into one list
all_cpos_list
=
[]
for
key
in
ordered_matches
.
keys
():
all_cpos_list
+=
ordered_matches
[
key
][
'
match_cpos
'
]
# Saves all cpos from before and after context into the list:
# all_context_cpos_list
all_context_cpos_list
=
[]
for
key
in
ordered_matches
.
keys
():
cpos_list
=
ordered_matches
[
key
][
'
match_cpos
'
]
before_index
=
max
([
0
,
cpos_list
[
0
]
-
self
.
context_len
])
after_index
=
min
([
self
.
corpus_max_len
,
cpos_list
[
-
1
]
+
self
.
context_len
])
ordered_matches
[
key
][
'
left_context_cpos
'
]
=
list
(
range
(
before_index
,
cpos_list
[
0
]))
ordered_matches
[
key
][
'
right_context_cpos
'
]
=
list
(
range
(
cpos_list
[
-
1
]
+
1
,
after_index
+
1
))
all_context_cpos_list
+=
ordered_matches
[
key
][
'
left_context_cpos
'
]
all_context_cpos_list
+=
ordered_matches
[
key
][
'
right_context_cpos
'
]
# Combines all_cpos_list with all_context_cpos_list as a sorted set
all_cpos_list
+=
all_context_cpos_list
all_cpos_list
=
sorted
(
list
(
set
(
all_cpos_list
)))
# Generate all cpos between match boundaries including start and end boundaries.
# Also generate cpos for left and right context.
# Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
# Also collect all cpos together in one list for the final request of
# all cpos information
all_matches
=
[]
all_cpos
=
[]
for
start
,
end
in
match_boundaries
:
lc_cpos
=
list
(
range
(
max
([
0
,
start
-
self
.
context_len
]),
start
))
lc
=
{
'
lc
'
:
lc_cpos
}
match_cpos
=
list
(
range
(
start
,
end
+
1
))
match
=
{
'
hit
'
:
match_cpos
}
rc_cpos
=
list
(
range
(
end
+
1
,
min
([
self
.
corpus_max_len
,
end
+
self
.
context_len
+
1
])))
rc
=
{
'
rc
'
:
rc_cpos
}
lc
.
update
(
match
)
lc
.
update
(
rc
)
all_cpos
.
extend
(
lc_cpos
+
match_cpos
+
rc_cpos
)
all_matches
.
append
(
lc
)
# print(all_matches)
# print(all_cpos)
# Get all sentences IDs for all above collected cpos in all_cpos
s_ids
=
self
.
cl_cpos2struc
(
'
UTOPIEN.s
'
,
all_cpos
)
# CHANGE to CORPUS.s will always be like this in nopaque
# Get all cpos for all sentence boundaries
s_lookup
=
{}
for
s_id
in
set
(
s_ids
):
s_start
,
s_end
=
self
.
cl_struc2cpos
(
'
UTOPIEN.s
'
,
s_id
)
# CHANGE to CORPUS.s will always be like this in nopaque
# print(s_start, s_end)
s_cpos
=
range
(
s_start
,
s_end
)
s_lookup
.
update
({
s_id
:
list
(
s_cpos
)})
# print(list(s_cpos))
all_cpos
.
extend
(
s_cpos
)
all_cpos
=
list
(
set
(
all_cpos
))
# get rid of cpos duplicates
# Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for
# all cpos entries in all_cpos_list
# Also saves this information into the ordered_matches dict
all_cpos_infos
,
s_list
=
self
.
get_cpos_infos
(
all_cpos_list
)
for
key
in
ordered_matches
.
keys
():
# loops over cpos in cpos_list which holds all match cpos
# Replaces one cpos with the corresponding cpos information created
# by self.get_cpos_infos(all_cpos_list)
cpos_list
=
ordered_matches
[
key
][
'
match_cpos
'
]
infos
=
[]
for
cpos
in
cpos_list
:
info
=
{
cpos
:
all_cpos_infos
.
get
(
cpos
)}
infos
.
append
(
info
)
ordered_matches
[
key
][
'
match_cpos
'
]
=
infos
try
:
# loops over cpos in ordered_matches[key]['left_context_cpos']
# which holds all cpos of the before context
# Replaces one cpos with the corresponding cpos information created
# by self.get_cpos_infos(all_cpos_list)
before_context_infos
=
[]
for
context_before_cpos
in
ordered_matches
[
key
][
'
left_context_cpos
'
]:
before_context_info
=
{
context_before_cpos
:
all_cpos_infos
.
get
(
context_before_cpos
)}
before_context_infos
.
append
(
before_context_info
)
ordered_matches
[
key
][
'
left_context_cpos
'
]
=
before_context_infos
except
UnboundLocalError
:
logger
.
warning
(
'
Context before cpos list is empty.
'
)
pass
try
:
# loops over cpos in ordered_matches[key]['right_context_cpos']
# which holds all cpos of the after context
# Replaces one cpos with the corresponding cpos information created
# by self.get_cpos_infos(all_cpos_list)
after_context_infos
=
[]
for
context_after_cpos
in
ordered_matches
[
key
][
'
right_context_cpos
'
]:
after_context_info
=
{
context_after_cpos
:
all_cpos_infos
.
get
(
context_after_cpos
)}
after_context_infos
.
append
(
after_context_info
)
ordered_matches
[
key
][
'
right_context_cpos
'
]
=
after_context_infos
except
UnboundLocalError
:
logger
.
warning
(
'
Context after cpos list is empty.
'
)
pass
sentences
=
{}
s_list
=
set
(
s_list
)
for
s_id
in
s_list
:
s_start
,
s_end
=
self
.
cl_struc2cpos
(
'
CORPUS.s
'
,
s_id
)
sentence
=
self
.
cl_cpos2str
(
'
CORPUS.word
'
,
range
(
s_start
,
s_end
+
1
))
sentences
.
update
({
s_id
:
re
.
sub
(
r
'
(?=\W)
'
,
''
,
'
'
.
join
(
sentence
))})
ordered_matches
[
'
sentences
'
]
=
sentences
return
ordered_matches
# Also saves this information into self.results dict
all_cpos_infos
,
text_lookup
=
self
.
get_cpos_infos
(
all_cpos
)
self
.
results
=
{
'
matches
'
:
all_matches
,
'
cpos_lookup
'
:
all_cpos_infos
,
'
s_lookup
'
:
s_lookup
,
'
text_lookup
'
:
text_lookup
}
return
self
.
results
# print(self.results)
def
get_cpos_infos
(
self
,
all_cpos
):
'''
...
...
@@ -228,25 +190,42 @@ class CQiWrapper(CQiClient):
all cpos entries specified in the parameter all_cpos.
'''
cpos_infos
=
{}
s_list
=
[]
for
key
in
self
.
attr_strings
.
keys
():
if
key
==
'
positional_attrs
'
:
for
p_attr_key
in
self
.
attr_strings
[
key
].
keys
():
match_strs
=
self
.
cl_cpos2str
(
self
.
attr_strings
[
key
][
p_attr_key
],
all_cpos
)
cpos_infos
[
p_attr_key
]
=
match_strs
elif
key
==
'
struct_attrs
'
:
for
struct_attr_key
in
self
.
attr_strings
[
key
].
keys
():
struct_entry
=
self
.
cl_cpos2struc
(
self
.
attr_strings
[
key
][
struct_attr_key
],
all_cpos
)
has_value
=
self
.
corpus_structural_attribute_has_values
(
self
.
attr_strings
[
key
][
struct_attr_key
])
if
has_value
:
match_strs
=
self
.
cl_struc2str
(
self
.
attr_strings
[
key
][
struct_attr_key
],
struct_entry
)
elif
self
.
attr_strings
[
key
][
struct_attr_key
]
==
'
CORPUS.s
'
:
s_list
.
extend
(
struct_entry
)
else
:
match_strs
=
[
None
for
i
in
struct_entry
]
cpos_infos
[
struct_attr_key
]
=
zip
(
struct_entry
,
match_strs
)
for
p_attr_key
in
self
.
attr_strings
[
'
positional_attrs
'
].
keys
():
match_strs
=
self
.
cl_cpos2str
(
self
.
attr_strings
[
'
positional_attrs
'
][
p_attr_key
],
all_cpos
)
cpos_infos
[
p_attr_key
]
=
match_strs
tmp_s_info
=
[]
tmp_text_info
=
[]
text_lookup
=
{}
tmp_dict
=
{}
for
struct_attr_key
in
self
.
attr_strings
[
'
struct_attrs
'
].
keys
():
check
=
self
.
attr_strings
[
'
struct_attrs
'
][
struct_attr_key
]
if
check
==
'
UTOPIEN.s
'
:
struct_ids
=
self
.
cl_cpos2struc
(
check
,
all_cpos
)
for
id
in
struct_ids
:
tmp_s_info
.
append
({
struct_attr_key
:
id
})
elif
check
==
'
UTOPIEN.entry
'
:
struct_ids
=
self
.
cl_cpos2struc
(
check
,
all_cpos
)
for
id
in
struct_ids
:
tmp_text_info
.
append
({
struct_attr_key
:
id
})
else
:
struct_ids
=
struct_ids
=
self
.
cl_cpos2struc
(
check
,
all_cpos
)
struct_values
=
self
.
cl_struc2str
(
self
.
attr_strings
[
'
struct_attrs
'
][
struct_attr_key
],
struct_ids
)
for
value
in
struct_values
:
for
id
in
struct_ids
:
tmp_dict
.
update
({
id
:
{
struct_attr_key
:
value
}})
print
(
tmp_dict
)
print
(
text_lookup
)
# struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][struct_attr_key], all_cpos)
# has_value = self.corpus_structural_attribute_has_values(self.attr_strings['struct_attrs'][struct_attr_key])
# if has_value:
# match_strs = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_entry)
# elif self.attr_strings['struct_attrs'][struct_attr_key] == 'CORPUS.s':
# pass
# else:
# match_strs = [None for i in struct_entry]
# cpos_infos[struct_attr_key] = zip(struct_entry, match_strs)
tmp_list
=
[]
attr_key_list
=
[]
for
key
in
cpos_infos
.
keys
():
...
...
@@ -256,4 +235,7 @@ class CQiWrapper(CQiClient):
dict_cpos_infos
=
{}
for
info
in
joined_cpos_infos
:
dict_cpos_infos
[
info
[
0
]]
=
dict
(
zip
(
attr_key_list
,
info
[
1
:]))
return
dict_cpos_infos
,
s_list
for
key
,
s_id
,
text_id
in
zip
(
dict_cpos_infos
.
keys
(),
tmp_s_info
,
tmp_text_info
):
dict_cpos_infos
[
key
].
update
(
s_id
)
dict_cpos_infos
[
key
].
update
(
text_id
)
return
dict_cpos_infos
,
text_lookup
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment