Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
nopaque
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Monitor
Service Desk
Analyze
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
Looking for advice? Join the
Matrix channel for GitLab users in Bielefeld
!
Show more breadcrumbs
SFB 1288 - INF
nopaque
Commits
5fdd67eb
Commit
5fdd67eb
authored
5 years ago
by
Stephan Porada
Browse files
Options
Downloads
Patches
Plain Diff
Add new CQiWrapper
parent
baf06d31
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
app/corpora/CQiWrapper/CQiClient.py
+1
-1
1 addition, 1 deletion
app/corpora/CQiWrapper/CQiClient.py
app/corpora/CQiWrapper/CQiWrapper.py
+154
-135
154 additions, 135 deletions
app/corpora/CQiWrapper/CQiWrapper.py
with
155 additions
and
136 deletions
app/corpora/CQiWrapper/CQiClient.py
+
1
−
1
View file @
5fdd67eb
from
.
import
CQi
import
CQi
import
socket
import
socket
import
struct
import
struct
...
...
This diff is collapsed.
Click to expand it.
app/corpora/CQiWrapper/CQiWrapper.py
+
154
−
135
View file @
5fdd67eb
from
.
CQiClient
import
CQiClient
from
CQiClient
import
CQiClient
import
multiprocessing
from
CQi
import
CONST_FIELD_MATCH
,
CONST_FIELD_MATCHEND
import
collections
import
collections
from
app
import
logger
# only works if imported into opaque web app
class
CQiWrapper
(
CQiClient
):
class
CQiWrapper
(
CQiClient
):
...
@@ -11,6 +12,8 @@ class CQiWrapper(CQiClient):
...
@@ -11,6 +12,8 @@ class CQiWrapper(CQiClient):
for ease of use. Also structures recieved data into python dictionaries.
for ease of use. Also structures recieved data into python dictionaries.
Keyword arguments:
Keyword arguments:
host -- host IP adress or hostname wher the cqp server is running
port -- port of the cqp server
username -- username used to connect to the cqp server
username -- username used to connect to the cqp server
password -- password of the user to connect to the cqp server
password -- password of the user to connect to the cqp server
"""
"""
...
@@ -32,12 +35,15 @@ class CQiWrapper(CQiClient):
...
@@ -32,12 +35,15 @@ class CQiWrapper(CQiClient):
"""
"""
self
.
ctrl_connect
(
self
.
username
,
self
.
password
)
self
.
ctrl_connect
(
self
.
username
,
self
.
password
)
def
create_attribute_strings
(
self
):
def
__create_attribute_strings
(
self
):
"""
Creates all needed attribute strings to query for word, lemma etc. in
the given corpus.
For example: CORPUS_NAME.word to query words
"""
p_attrs
=
self
.
corpus_positional_attributes
(
self
.
corpus_name
)
p_attrs
=
self
.
corpus_positional_attributes
(
self
.
corpus_name
)
struct_attrs
=
self
.
corpus_structural_attributes
(
self
.
corpus_name
)
struct_attrs
=
self
.
corpus_structural_attributes
(
self
.
corpus_name
)
self
.
meta_struct_element
=
struct_attrs
[
0
]
self
.
meta_struct_element
=
struct_attrs
[
0
]
print
(
p_attrs
)
print
(
struct_attrs
)
self
.
attr_strings
=
{}
self
.
attr_strings
=
{}
self
.
attr_strings
[
'
positional_attrs
'
]
=
{}
self
.
attr_strings
[
'
positional_attrs
'
]
=
{}
self
.
attr_strings
[
'
struct_attrs
'
]
=
{}
self
.
attr_strings
[
'
struct_attrs
'
]
=
{}
...
@@ -49,8 +55,17 @@ class CQiWrapper(CQiClient):
...
@@ -49,8 +55,17 @@ class CQiWrapper(CQiClient):
self
.
attr_strings
[
'
struct_attrs
'
][
struct_attr
]
=
(
self
.
corpus_name
self
.
attr_strings
[
'
struct_attrs
'
][
struct_attr
]
=
(
self
.
corpus_name
+
'
.
'
+
'
.
'
+
struct_attr
)
+
struct_attr
)
def
set_corpus_name
(
self
,
corpus_name
):
logger
.
warning
((
'
All positional and
'
self
.
corpus_name
=
corpus_name
'
structural attributes: {}
'
).
format
(
self
.
attr_strings
))
def
select_corpus
(
self
,
corpus_name
):
if
corpus_name
in
self
.
corpus_list_coprora
():
self
.
corpus_name
=
corpus_name
self
.
__create_attribute_strings
()
logger
.
warning
(
'
{} does exist.
'
.
format
(
corpus_name
))
else
:
self
.
disconnect
()
logger
.
warning
(
'
{} does not exist.
'
.
format
(
corpus_name
))
def
disconnect
(
self
):
def
disconnect
(
self
):
"""
"""
...
@@ -60,8 +75,9 @@ class CQiWrapper(CQiClient):
...
@@ -60,8 +75,9 @@ class CQiWrapper(CQiClient):
"""
"""
self
.
ctrl_bye
()
self
.
ctrl_bye
()
self
.
connection
.
close
()
self
.
connection
.
close
()
logger
.
warning
(
'
Disconnected from cqp server.
'
)
def
query_subcorpus
(
self
,
result_subcorpus_name
,
q
uery
):
def
query_subcorpus
(
self
,
query
,
result_subcorpus_name
=
'
Q
uery
-results
'
):
"""
"""
Create subcorpus
Create subcorpus
...
@@ -74,152 +90,155 @@ class CQiWrapper(CQiClient):
...
@@ -74,152 +90,155 @@ class CQiWrapper(CQiClient):
query -- query written in cqp query language
query -- query written in cqp query language
"""
"""
self
.
cqp_query
(
self
.
corpus_name
,
result_subcorpus_name
,
query
)
self
.
cqp_query
(
self
.
corpus_name
,
result_subcorpus_name
,
query
)
self
.
result_subcorpus
_ns
=
(
self
.
corpus_name
self
.
result_subcorpus
=
(
self
.
corpus_name
+
'
:
'
+
'
:
'
+
result_subcorpus_name
)
+
result_subcorpus_name
)
self
.
SUBCORPUS_NAMES
.
append
(
self
.
result_subcorpus
_ns
)
self
.
SUBCORPUS_NAMES
.
append
(
self
.
result_subcorpus
)
self
.
nr_matches
=
self
.
cqp_subcorpus_size
(
self
.
result_subcorpus
_ns
)
self
.
nr_matches
=
self
.
cqp_subcorpus_size
(
self
.
result_subcorpus
)
pr
in
t
(
'
Nr of all matches is:
'
,
self
.
nr_matches
)
logger
.
warn
in
g
(
'
Nr of all matches is:
{}
'
.
format
(
self
.
nr_matches
)
)
def
show_subcorpora
(
self
):
def
show_subcorpora
(
self
):
"""
Show all subcorpora currently saved by the cqp server.
"""
return
self
.
cqp_list_subcorpora
(
self
.
corpus_name
)
return
self
.
cqp_list_subcorpora
(
self
.
corpus_name
)
def
show_results
(
self
,
def
show_query_results
(
self
,
result_start_count
=
0
,
context_len
=
10
,
result_max_count
=
50
,
result_len
=
1000
):
context_len
=
10
,):
"""
"""
Show query results
Show query results
Shows the actual matched strings produce by the query. Uses the cpos
Shows the actual matched strings produce by the query. Uses the cpos
match indexes to grab those strings. saves them into an orderd
match indexes to grab those strings. saves them into an orderd
dictionary. Also saves coresponding tags, lemmas and context:
dictionary. Also saves coresponding tags, lemmas and context. Gets those
OrderedDict([
informations using the corresponding cpos.
(0,
{
'
tokens
'
: [
'
Big
'
,
'
Brother
'
,
'
himself
'
],
'
lemmas
'
: [
'
big
'
,
'
brother
'
,
'
himself
'
],
'
pos_tags
'
: [
'
JJ
'
,
'
NN1
'
,
'
PPX1
'
],
'
sem_tags
'
: [
'
|A11.1+|N3.2+|N5+|
'
,
'
|S2.2m|S4m|S9/S2.2m|
'
,
'
|Z8m|
'
],
'
context_before
'
: [
'
figures
'
,
'
of
'
,
'
the
'
,
'
Party
'
,
'
,
'
,
'
almost
'
,
'
on
'
,
'
a
'
,
'
level
'
,
'
with
'
],
'
context_after
'
: [
'
,
'
,
'
and
'
,
'
then
'
,
'
had
'
,
'
engaged
'
,
'
in
'
,
'
counter-revolu-
'
,
'
tionary
'
,
'
activities
'
,
'
,
'
],
'
entry_title
'
:
'
1984
'
,
'
entry_author
'
:
'
george_orwell
'
,
'
cpos_start
'
: 110490,
'
cpos_end
'
: 110492
}
)
])
Keyword arguments:
Keyword arguments:
result_start_count -- start position of the dumped subcorpus.
(default 0) If it is 0 matches 0 to 50 will be shown. If it is 50
matches 50 to 100 will be shown.
result_max_count -- defines how many matches at once will be shown.
(default 50)
context_len -- defines how many words before and after a match will be
context_len -- defines how many words before and after a match will be
shown (default 10)
shown (default 10)
result_len -- defines how many results are actually grabbed
"""
"""
self
.
context_len
=
context_len
self
.
context_len
=
context_len
self
.
corpus_max_len
=
self
.
cl_attribute_size
(
self
.
attr_strings
[
'
positional_attrs
'
][
'
word
'
])
self
.
corpus_max_len
=
self
.
cl_attribute_size
(
self
.
attr_strings
[
'
positional_attrs
'
][
'
word
'
]
)
self
.
nr_matches
=
min
(
result_len
,
self
.
nr_matches
)
if
self
.
nr_matches
==
0
:
if
self
.
nr_matches
==
0
:
print
(
'
Query resulted in 0 matches.
'
)
logger
.
warning
(
'
Query resulted in 0 matches.
'
)
self
.
disconnect
return
None
else
:
else
:
if
self
.
nr_matches
<=
50
:
# Get match cpos boundries
matches_start
=
self
.
cqp_dump_subcorpus
(
self
.
result_subcorpus_ns
,
# match_boundries shows the start and end cpos of one match as a
0x10
,
# pair of cpositions
0
,
# [(1355, 1357), (1477, 1479)] Example for two boundry pairs
self
.
nr_matches
-
1
)
match_boundaries
=
zip
(
self
.
cqp_dump_subcorpus
(
self
.
result_subcorpus
,
matches_end
=
self
.
cqp_dump_subcorpus
(
self
.
result_subcorpus_ns
,
CONST_FIELD_MATCH
,
0x11
,
0
,
0
,
self
.
nr_matches
-
1
)
self
.
nr_matches
-
1
),
else
:
self
.
cqp_dump_subcorpus
(
self
.
result_subcorpus
,
matches_start
=
self
.
cqp_dump_subcorpus
(
self
.
result_subcorpus_ns
,
CONST_FIELD_MATCHEND
,
0x10
,
0
,
result_start_count
,
self
.
nr_matches
-
1
))
result_max_count
-
1
)
matches_end
=
self
.
cqp_dump_subcorpus
(
self
.
result_subcorpus_ns
,
# Generate all cpos between boundries including start and end boundries
0x11
,
# Save them as list into on match entry at serial number 'i'
result_start_count
,
ordered_matches
=
collections
.
OrderedDict
()
result_max_count
-
1
)
for
i
,
match_pair
in
enumerate
(
match_boundaries
):
match_indexes
=
zip
(
matches_start
,
matches_end
)
ordered_matches
[
i
]
=
({
'
match_cpos_list
'
:
list
(
range
(
match_pair
[
0
],
matches
=
[]
match_pair
[
1
]
+
1
))})
manager
=
multiprocessing
.
Manager
()
# Saves cpos form all match entries into one list
return_dict
=
manager
.
dict
()
all_cpos_list
=
[]
for
i
,
index_pair
in
enumerate
(
match_indexes
):
for
key
in
ordered_matches
.
keys
():
match
=
multiprocessing
.
Process
(
target
=
self
.
__get_matches
,
all_cpos_list
+=
ordered_matches
[
key
][
'
match_cpos_list
'
]
args
=
(
i
,
index_pair
,
# Saves all cpos from before and after context into the list:
self
.
corpus_name
,
# all_context_cpos_list
return_dict
))
all_context_cpos_list
=
[]
matches
.
append
(
match
)
for
key
in
ordered_matches
.
keys
():
match
.
start
()
cpos_list
=
ordered_matches
[
key
][
'
match_cpos_list
'
]
for
match
in
matches
:
before_index
=
max
([
0
,
cpos_list
[
0
]
-
self
.
context_len
])
match
.
join
()
after_index
=
min
([
self
.
corpus_max_len
,
# sort matches into ordered dict
cpos_list
[
-
1
]
+
self
.
context_len
])
ordered_results
=
collections
.
OrderedDict
()
ordered_matches
[
key
][
'
context_before_cpos_list
'
]
=
list
(
range
(
before_index
,
for
key
in
sorted
(
return_dict
.
keys
()):
cpos_list
[
0
]))
ordered_results
[
key
]
=
return_dict
[
key
]
ordered_matches
[
key
][
'
context_after_cpos_list
'
]
=
list
(
range
(
cpos_list
[
-
1
]
+
1
,
return
ordered_results
after_index
+
1
))
all_context_cpos_list
+=
ordered_matches
[
key
][
'
context_before_cpos_list
'
]
def
get_cpos_info
(
self
,
cpos
,
session
):
all_context_cpos_list
+=
ordered_matches
[
key
][
'
context_after_cpos_list
'
]
match_dict
=
{}
# Combines all_cpos_list with all_context_cpos_list as a sorted set
all_cpos_list
+=
all_context_cpos_list
all_cpos_list
=
sorted
(
list
(
set
(
all_cpos_list
)))
# Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
# all cpos entries in all_cpos_list
# Also saves these informations into the ordered_matches dict
all_cpos_infos
=
self
.
get_cpos_infos
(
all_cpos_list
)
for
key
in
ordered_matches
.
keys
():
# loops over cpos in cpos_list which holds all match cpos
# Replaces one cpos with the corresponding cpos information created
# by self.get_cpos_infos(all_cpos_list)
cpos_list
=
ordered_matches
[
key
][
'
match_cpos_list
'
]
infos
=
[]
for
cpos
in
cpos_list
:
info
=
{
cpos
:
all_cpos_infos
.
get
(
cpos
)}
infos
.
append
(
info
)
ordered_matches
[
key
][
'
match_cpos_list
'
]
=
infos
try
:
# loops over cpos in ordered_matches[key]['context_before_cpos_list']
# which holds all cpos of the before context
# Replaces one cpos with the corresponding cpos information created
# by self.get_cpos_infos(all_cpos_list)
before_context_infos
=
[]
for
context_before_cpos
in
ordered_matches
[
key
][
'
context_before_cpos_list
'
]:
before_context_info
=
{
context_before_cpos
:
all_cpos_infos
.
get
(
context_before_cpos
)}
before_context_infos
.
append
(
before_context_info
)
ordered_matches
[
key
][
'
context_before_cpos_list
'
]
=
before_context_infos
except
UnboundLocalError
:
logger
.
warning
(
'
Context before cpos list is empty.
'
)
try
:
# loops over cpos in ordered_matches[key]['context_after_cpos_list']
# which holds all cpos of the before context
# Replaces one cpos with the corresponding cpos information created
# by self.get_cpos_infos(all_cpos_list)
after_context_infos
=
[]
for
context_after_cpos
in
ordered_matches
[
key
][
'
context_after_cpos_list
'
]:
after_context_info
=
{
context_after_cpos
:
all_cpos_infos
.
get
(
context_after_cpos
)}
after_context_infos
.
append
(
after_context_info
)
ordered_matches
[
key
][
'
context_after_cpos_list
'
]
=
after_context_infos
except
UnboundLocalError
:
logger
.
warning
(
'
Context after cpos list is empty.
'
)
return
ordered_matches
def
get_cpos_infos
(
self
,
all_cpos
):
'''
Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
all cpos entries specified in the parameter all_cpos.
'''
cpos_infos
=
{}
for
attr_dict
in
self
.
attr_strings
:
for
attr_dict
in
self
.
attr_strings
:
# print(self.attr_strings[attr_dict])
if
attr_dict
==
'
positional_attrs
'
:
if
attr_dict
==
'
positional_attrs
'
:
for
p_attr_key
in
self
.
attr_strings
[
attr_dict
].
keys
():
for
p_attr_key
in
self
.
attr_strings
[
attr_dict
].
keys
():
# print(
p_attr_key
)
match_str
=
self
.
cl_cpos2str
(
self
.
attr_strings
[
attr_dict
][
p_attr_key
],
match_str
=
session
.
cl_cpos2str
(
self
.
attr_strings
[
attr_dict
][
p_attr_key
],
range
(
cpos
[
0
],
cpos
[
1
])
)
all_cpos
)
match_dict
[
p_attr_key
]
=
match_str
cpos_infos
[
p_attr_key
]
=
match_str
elif
attr_dict
==
'
struct_attrs
'
:
elif
attr_dict
==
'
struct_attrs
'
:
for
struct_attr_key
in
self
.
attr_strings
[
attr_dict
].
keys
():
for
struct_attr_key
in
self
.
attr_strings
[
attr_dict
].
keys
():
# print(struct_attr_key)
struct_entry
=
self
.
cl_cpos2struc
(
self
.
attr_strings
[
'
struct_attrs
'
][
self
.
meta_struct_element
],
struct_entry
=
session
.
cl_cpos2struc
(
self
.
attr_strings
[
'
struct_attrs
'
][
self
.
meta_struct_element
],
all_cpos
)
range
(
cpos
[
0
],
cpos
[
1
]))
match_str
=
self
.
cl_struc2str
(
self
.
attr_strings
[
attr_dict
][
struct_attr_key
],
struct_entry
)
match_str
=
session
.
cl_struc2str
(
self
.
attr_strings
[
attr_dict
][
struct_attr_key
],
struct_entry
)
cpos_infos
[
struct_attr_key
]
=
match_str
match_dict
[
struct_attr_key
]
=
set
(
match_str
)
tmp_list
=
[]
return
match_dict
attr_key_list
=
[]
for
key
in
cpos_infos
.
keys
():
def
__get_matches
(
self
,
i
,
index_pair
,
corpus_name
,
return_dict
):
tmp_list
.
append
(
cpos_infos
[
key
])
"""
attr_key_list
.
append
(
key
)
Get matches as readable output
joined_cpos_infos
=
zip
(
all_cpos
,
*
tmp_list
)
dict_cpos_infos
=
{}
Gets the actual match strings of cpos match indexes. Private helper
for
info
in
joined_cpos_infos
:
method used in show_results.
dict_cpos_infos
[
info
[
0
]]
=
dict
(
zip
(
attr_key_list
,
info
[
1
:]))
return
dict_cpos_infos
Keyword arguments:
i -- serial number for match at given cpos
index_pair -- match start and match end cpos
corpus_name -- name of the parent corpus
return_dict -- dictionary created with manager.dict() that holds the
extracted strings tags etc.
"""
# print('START:', index_pair[0])
# print('END:', index_pair[1])
# print('=============================')
index_pair
=
[
index_pair
[
0
],
index_pair
[
1
]
+
1
]
tmp_session
=
CQiWrapper
(
username
=
self
.
username
,
password
=
self
.
password
,
host
=
self
.
host
,
port
=
self
.
port
)
tmp_session
.
connect
()
match
=
self
.
get_cpos_info
(
index_pair
,
tmp_session
)
before_index
=
max
([
0
,
index_pair
[
0
]
-
self
.
context_len
])
after_index
=
min
([
self
.
corpus_max_len
,
index_pair
[
1
]
+
self
.
context_len
])
context_before
=
tmp_session
.
cl_cpos2str
(
self
.
attr_strings
[
'
positional_attrs
'
][
'
word
'
],
range
(
before_index
,
index_pair
[
0
]))
context_after
=
tmp_session
.
cl_cpos2str
(
self
.
attr_strings
[
'
positional_attrs
'
][
'
word
'
],
range
(
index_pair
[
1
]
+
1
,
after_index
+
1
))
tmp_dict
=
{
'
context_before
'
:
context_before
,
'
context_after
'
:
context_after
,
'
cpos_start
'
:
index_pair
[
0
],
'
cpos_end
'
:
index_pair
[
1
]}
match
.
update
(
tmp_dict
)
return_dict
[
i
]
=
match
tmp_session
.
disconnect
()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment