Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
nopaque
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Monitor
Service Desk
Analyze
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
Looking for advice? Join the
Matrix channel for GitLab users in Bielefeld
!
Show more breadcrumbs
SFB 1288 - INF
nopaque
Commits
3af400a7
Commit
3af400a7
authored
5 years ago
by
Stephan Porada
Browse files
Options
Downloads
Patches
Plain Diff
Add get_sentences to wrapper
parent
6317a479
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
app/corpora/CQiWrapper/CQiWrapper.py
+94
-49
94 additions, 49 deletions
app/corpora/CQiWrapper/CQiWrapper.py
with
94 additions
and
49 deletions
app/corpora/CQiWrapper/CQiWrapper.py
+
94
−
49
View file @
3af400a7
...
@@ -5,7 +5,7 @@ from app import logger # only works if imported into opaque web app
...
@@ -5,7 +5,7 @@ from app import logger # only works if imported into opaque web app
class
CQiWrapper
(
CQiClient
):
class
CQiWrapper
(
CQiClient
):
"""
'''
CQIiWrapper object
CQIiWrapper object
High level wrapper that groups and renames some functions of CQiClient
High level wrapper that groups and renames some functions of CQiClient
...
@@ -16,7 +16,7 @@ class CQiWrapper(CQiClient):
...
@@ -16,7 +16,7 @@ class CQiWrapper(CQiClient):
port -- port of the cqp server
port -- port of the cqp server
username -- username used to connect to the cqp server
username -- username used to connect to the cqp server
password -- password of the user to connect to the cqp server
password -- password of the user to connect to the cqp server
"""
'''
SUBCORPUS_NAMES
=
[]
SUBCORPUS_NAMES
=
[]
...
@@ -27,20 +27,21 @@ class CQiWrapper(CQiClient):
...
@@ -27,20 +27,21 @@ class CQiWrapper(CQiClient):
self
.
password
=
password
self
.
password
=
password
def
connect
(
self
):
def
connect
(
self
):
"""
'''
Connect with CQP server
Connect with CQP server
Connects via socket to the CQP server using the given username and
Connects via socket to the CQP server using the given username and
password from class initiation.
password from class initiation.
"""
'''
self
.
ctrl_connect
(
self
.
username
,
self
.
password
)
self
.
ctrl_connect
(
self
.
username
,
self
.
password
)
def
__create_attribute_strings
(
self
):
def
__create_attribute_strings
(
self
):
"""
'''
Creates all needed attribute strings to query for word, lemma etc. in
Creates all needed attribute strings to query for word, lemma etc. in
the given corpus.
the given corpus.
For example: CORPUS_NAME.word to query words
For example: CORPUS_NAME.word to query words
"""
Automatically creates strings for all predefined tags.
'''
p_attrs
=
self
.
corpus_positional_attributes
(
self
.
corpus_name
)
p_attrs
=
self
.
corpus_positional_attributes
(
self
.
corpus_name
)
struct_attrs
=
self
.
corpus_structural_attributes
(
self
.
corpus_name
)
struct_attrs
=
self
.
corpus_structural_attributes
(
self
.
corpus_name
)
self
.
attr_strings
=
{}
self
.
attr_strings
=
{}
...
@@ -54,40 +55,45 @@ class CQiWrapper(CQiClient):
...
@@ -54,40 +55,45 @@ class CQiWrapper(CQiClient):
self
.
attr_strings
[
'
struct_attrs
'
][
struct_attr
]
=
(
self
.
corpus_name
self
.
attr_strings
[
'
struct_attrs
'
][
struct_attr
]
=
(
self
.
corpus_name
+
'
.
'
+
'
.
'
+
struct_attr
)
+
struct_attr
)
#
logger.warning(('All positional and '
logger
.
warning
((
'
All positional and
'
#
'structural attributes: {}').format(self.attr_strings))
'
structural attributes: {}
'
).
format
(
self
.
attr_strings
))
def
select_corpus
(
self
,
corpus_name
):
def
select_corpus
(
self
,
corpus_name
):
'''
Checks if the given corpus name exists. If it exists, set it as the main
corpus name used to create the needed query attribute strings like
CORPUS_NAME.word.
'''
if
corpus_name
in
self
.
corpus_list_coprora
():
if
corpus_name
in
self
.
corpus_list_coprora
():
self
.
corpus_name
=
corpus_name
self
.
corpus_name
=
corpus_name
self
.
__create_attribute_strings
()
self
.
__create_attribute_strings
()
#
logger.warning('{} does exist.'.format(corpus_name))
logger
.
warning
(
'
{} does exist.
'
.
format
(
corpus_name
))
else
:
else
:
#
logger.warning('{} does not exist.'.format(corpus_name))
logger
.
warning
(
'
{} does not exist.
'
.
format
(
corpus_name
))
pass
raise
Exception
(
'
Given Corpus Name is not in corpora list.
'
)
def
disconnect
(
self
):
def
disconnect
(
self
):
"""
'''
Disconnect from CQP server
Disconnect from CQP server
Disconnects from the CQP server. Closes used socket after disconnect.
Disconnects from the CQP server. Closes used socket after disconnect.
"""
'''
self
.
ctrl_bye
()
self
.
ctrl_bye
()
self
.
connection
.
close
()
self
.
connection
.
close
()
#
logger.warning('Disconnected from cqp server.')
logger
.
warning
(
'
Disconnected from cqp server.
'
)
def
query_subcorpus
(
self
,
query
,
result_subcorpus_name
=
'
Query-results
'
):
def
query_subcorpus
(
self
,
query
,
result_subcorpus_name
=
'
Query-results
'
):
"""
'''
Create subcorpus
Create subcorpus
Input query will be used to create a subcorpus holding all cpos match
Input query will be used to create a subcorpus holding all cpos match
positions for that query.
positions for that query.
Keyword arguments:
Keyword arguments:
result_subcorpus_name --
user
set name of the subcorpus which holds all
result_subcorpus_name -- set name of the subcorpus which holds all
cpos match positions, produced by the query
cpos match positions, produced by the query
query -- query written in cqp query language
query -- query written in cqp query language
"""
'''
self
.
cqp_query
(
self
.
corpus_name
,
result_subcorpus_name
,
query
)
self
.
cqp_query
(
self
.
corpus_name
,
result_subcorpus_name
,
query
)
self
.
result_subcorpus
=
(
self
.
corpus_name
self
.
result_subcorpus
=
(
self
.
corpus_name
+
'
:
'
+
'
:
'
...
@@ -95,19 +101,19 @@ class CQiWrapper(CQiClient):
...
@@ -95,19 +101,19 @@ class CQiWrapper(CQiClient):
self
.
SUBCORPUS_NAMES
.
append
(
self
.
result_subcorpus
)
self
.
SUBCORPUS_NAMES
.
append
(
self
.
result_subcorpus
)
self
.
nr_matches
=
self
.
cqp_subcorpus_size
(
self
.
result_subcorpus
)
self
.
nr_matches
=
self
.
cqp_subcorpus_size
(
self
.
result_subcorpus
)
print
(
'
Nr of all matches is:
'
,
self
.
nr_matches
)
print
(
'
Nr of all matches is:
'
,
self
.
nr_matches
)
#
logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
logger
.
warning
(
'
Nr of all matches is: {}
'
.
format
(
self
.
nr_matches
))
def
show_subcorpora
(
self
):
def
show_subcorpora
(
self
):
"""
'''
Show all subcorpora currently saved by the cqp server.
Show all subcorpora currently saved by the cqp server.
"""
'''
return
self
.
cqp_list_subcorpora
(
self
.
corpus_name
)
return
self
.
cqp_list_subcorpora
(
self
.
corpus_name
)
def
show_query_results
(
self
,
def
show_query_results
(
self
,
context_len
=
10
,
context_len
=
10
,
result_len
=
1000
,
result_len
=
1000
,
result_offset
=
0
):
result_offset
=
0
):
"""
'''
Show query results
Show query results
Shows the actual matched strings produce by the query. Uses the cpos
Shows the actual matched strings produce by the query. Uses the cpos
...
@@ -118,15 +124,20 @@ class CQiWrapper(CQiClient):
...
@@ -118,15 +124,20 @@ class CQiWrapper(CQiClient):
Keyword arguments:
Keyword arguments:
context_len -- defines how many words before and after a match will be
context_len -- defines how many words before and after a match will be
shown (default 10)
shown (default 10)
result_len -- defines how many results are actually grabbed
result_len -- defines for how many matches all informations like lemma
"""
and POS are being grabbed
result_offset -- defines the offset of the matches being requested. If
the offset is 100, information for matches 100 to result_len is being
grabbed
'''
t0
=
time
.
time
()
self
.
context_len
=
context_len
self
.
context_len
=
context_len
self
.
corpus_max_len
=
self
.
cl_attribute_size
(
self
.
corpus_max_len
=
self
.
cl_attribute_size
(
self
.
attr_strings
[
'
positional_attrs
'
][
'
word
'
]
self
.
attr_strings
[
'
positional_attrs
'
][
'
word
'
]
)
)
self
.
nr_matches
=
min
(
result_len
,
self
.
nr_matches
)
self
.
nr_matches
=
min
(
result_len
,
self
.
nr_matches
)
if
self
.
nr_matches
==
0
:
if
self
.
nr_matches
==
0
:
#
logger.warning('Query resulted in 0 matches.')
logger
.
warning
(
'
Query resulted in 0 matches.
'
)
return
None
return
None
else
:
else
:
# Get match cpos boundries
# Get match cpos boundries
...
@@ -144,7 +155,8 @@ class CQiWrapper(CQiClient):
...
@@ -144,7 +155,8 @@ class CQiWrapper(CQiClient):
offset_start
,
offset_start
,
offset_end
))
offset_end
))
# Generate all cpos between match boundries including start and end boundries.
# Generate all cpos between match boundries including start and end
# boundries.
# Also generate cpos for left and right context.
# Also generate cpos for left and right context.
# Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
# Save those cpos into dict as lists for the keys 'lc', 'hit' and 'rc'
# Also collect all cpos together in one list for the final request of
# Also collect all cpos together in one list for the final request of
...
@@ -157,42 +169,34 @@ class CQiWrapper(CQiClient):
...
@@ -157,42 +169,34 @@ class CQiWrapper(CQiClient):
lc
=
{
'
lc
'
:
lc_cpos
}
lc
=
{
'
lc
'
:
lc_cpos
}
match_cpos
=
list
(
range
(
start
,
end
))
match_cpos
=
list
(
range
(
start
,
end
))
match
=
{
'
hit
'
:
match_cpos
}
match
=
{
'
hit
'
:
match_cpos
}
rc_cpos
=
list
(
range
(
end
,
min
([
self
.
corpus_max_len
,
end
+
self
.
context_len
])))
rc_cpos
=
list
(
range
(
end
,
min
([
self
.
corpus_max_len
,
end
+
self
.
context_len
])))
rc
=
{
'
rc
'
:
rc_cpos
}
rc
=
{
'
rc
'
:
rc_cpos
}
lc
.
update
(
match
)
lc
.
update
(
match
)
lc
.
update
(
rc
)
lc
.
update
(
rc
)
all_cpos
.
extend
(
lc_cpos
+
match_cpos
+
rc_cpos
)
all_cpos
.
extend
(
lc_cpos
+
match_cpos
+
rc_cpos
)
all_matches
.
append
(
lc
)
all_matches
.
append
(
lc
)
# print(all_matches)
# print(all_cpos)
all_cpos
=
list
(
set
(
all_cpos
))
# get rid of cpos duplicates
len_all_cpos
=
len
(
all_cpos
)
# Get all cpos for all sentence boundaries
# s_lookup = {}
# for s_id in set(s_ids):
# s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id)
# # CHANGE to UTOPIEN.s will always be like this in nopaque
# s_cpos = range(s_start, s_end)
# s_lookup.update({s_id: list(s_cpos)})
# # print(list(s_cpos))
# all_cpos.extend(s_cpos)
t0
=
time
.
time
()
all_cpos
=
list
(
set
(
all_cpos
))
# get rid of cpos duplicates
t1
=
time
.
time
()
t1
=
time
.
time
()
t_total
=
t1
-
t0
t_total
=
t1
-
t0
pr
in
t
(
'
T
IME FOR ALL CPOS:
'
,
t_total
)
logger
.
warn
in
g
(
'
T
ime to create all CPOS for query: {}
'
.
format
(
t_total
)
)
print
(
'
CPOS SUM:
'
,
len
(
all_cpos
))
print
(
'
Requesting {} CPOS with one query.
'
.
format
(
len
_
all_cpos
))
# Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
# Get cpos informations like CORPUS_NAME.word or CORPUS_NAME.lemma for
# all cpos entries in all_cpos_list
# all cpos entries in all_cpos_list
# Also saves these informations into self.results dict
# Also saves these informations into self.results dict
t
6
=
time
.
time
()
t
2
=
time
.
time
()
all_cpos_infos
,
text_lookup
=
self
.
get_cpos_infos
(
all_cpos
)
all_cpos_infos
,
text_lookup
=
self
.
get_cpos_infos
(
all_cpos
)
t7
=
time
.
time
()
t3
=
time
.
time
()
t_final
=
t7
-
t6
t_final
=
t3
-
t2
print
(
'
GOT ALL RESULTS IN:
'
,
t_final
)
print
(
'
Got infos for {} CPOS in {} seconds:
'
.
format
(
len_all_cpos
,
t_final
))
self
.
results
=
{
'
matches
'
:
all_matches
,
'
cpos_lookup
'
:
all_cpos_infos
,
self
.
results
=
{
'
matches
'
:
all_matches
,
'
text_lookup
'
:
text_lookup
}
'
cpos_lookup
'
:
all_cpos_infos
,
'
text_lookup
'
:
text_lookup
,
'
nr_matches
'
:
self
.
nr_matches
}
return
self
.
results
return
self
.
results
def
get_cpos_infos
(
self
,
all_cpos
):
def
get_cpos_infos
(
self
,
all_cpos
):
...
@@ -250,3 +254,44 @@ class CQiWrapper(CQiClient):
...
@@ -250,3 +254,44 @@ class CQiWrapper(CQiClient):
for
info
in
joined_cpos_infos
:
for
info
in
joined_cpos_infos
:
dict_cpos_infos
[
info
[
0
]]
=
dict
(
zip
(
attr_keys_list
,
info
[
1
:]))
dict_cpos_infos
[
info
[
0
]]
=
dict
(
zip
(
attr_keys_list
,
info
[
1
:]))
return
dict_cpos_infos
,
text_lookup
return
dict_cpos_infos
,
text_lookup
def
get_sentences
(
self
,
match_cpos_list
,
get_surrounding_s
=
False
,
l_r_s_context_additional_len
=
1
):
'''
Get sentence information for one match. Also set whether and how many left and
right context sentences should be grabbed surrounding the given CPOS.
'''
t0
=
time
.
time
()
key
=
self
.
corpus_name
+
'
.s
'
first_cpos
,
last_cpos
=
match_cpos_list
[
0
],
match_cpos_list
[
-
1
]
context_sentences
=
{}
s_ids
=
self
.
cl_cpos2struc
(
key
,
[
first_cpos
,
last_cpos
])
for
s_id
in
s_ids
:
s_start
,
s_end
=
self
.
cl_struc2cpos
(
key
,
s_id
)
s_cpos
=
list
(
range
(
s_start
,
s_end
+
1
))
context_sentences
[
s_id
]
=
s_cpos
if
get_surrounding_s
:
max_s_id
=
self
.
cl_attribute_size
(
key
)
additional_s_ids
=
[]
additional_s
=
list
(
range
(
max
(
s_ids
[
0
]
-
l_r_s_context_additional_len
,
0
),
min
(
s_ids
[
-
1
]
+
l_r_s_context_additional_len
,
max_s_id
)
+
1
))
additional_s_ids
.
extend
(
additional_s
)
for
s_id
in
additional_s_ids
:
s_start
,
s_end
=
self
.
cl_struc2cpos
(
key
,
s_id
)
s_cpos
=
list
(
range
(
s_start
,
s_end
+
1
))
context_sentences
[
s_id
]
=
s_cpos
all_cpos
=
[]
for
key
in
context_sentences
.
keys
():
all_cpos
.
extend
(
context_sentences
[
key
])
all_cpos
=
list
(
set
(
all_cpos
))
all_cpos_infos
,
text_lookup
=
self
.
get_cpos_infos
(
all_cpos
)
t1
=
time
.
time
()
t_total
=
t1
-
t0
logger
.
warning
(
'
Got all sentences informations in {} seconds
'
.
format
(
t_total
))
return
context_sentences
,
all_cpos_infos
,
text_lookup
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment