Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
nopaque
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Monitor
Service Desk
Analyze
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
Looking for advice? Join the
Matrix channel for GitLab users in Bielefeld
!
Show more breadcrumbs
SFB 1288 - INF
nopaque
Commits
dbd580b3
Commit
dbd580b3
authored
5 years ago
by
Stephan Porada
Browse files
Options
Downloads
Patches
Plain Diff
Get results with wrapper 3.0
parent
dec90e30
No related branches found
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
app/corpora/CQiWrapper/CQiWrapper.py
+67
-57
67 additions, 57 deletions
app/corpora/CQiWrapper/CQiWrapper.py
app/corpora/events.py
+5
-42
5 additions, 42 deletions
app/corpora/events.py
app/templates/corpora/analyse_corpus.html.j2
+5
-4
5 additions, 4 deletions
app/templates/corpora/analyse_corpus.html.j2
with
77 additions
and
103 deletions
app/corpora/CQiWrapper/CQiWrapper.py
+
67
−
57
View file @
dbd580b3
from
.CQiClient
import
CQiClient
from
.CQi
import
CONST_FIELD_MATCH
,
CONST_FIELD_MATCHEND
import
r
e
import
tim
e
from
app
import
logger
# only works if imported into opaque web app
...
...
@@ -94,6 +94,7 @@ class CQiWrapper(CQiClient):
+
result_subcorpus_name
)
self
.
SUBCORPUS_NAMES
.
append
(
self
.
result_subcorpus
)
self
.
nr_matches
=
self
.
cqp_subcorpus_size
(
self
.
result_subcorpus
)
print
(
'
Nr of all matches is:
'
,
self
.
nr_matches
)
# logger.warning('Nr of all matches is: {}'.format(self.nr_matches))
def
show_subcorpora
(
self
):
...
...
@@ -104,7 +105,8 @@ class CQiWrapper(CQiClient):
def
show_query_results
(
self
,
context_len
=
10
,
result_len
=
1000
):
result_len
=
1000
,
result_offset
=
0
):
"""
Show query results
...
...
@@ -131,14 +133,16 @@ class CQiWrapper(CQiClient):
# match_boundaries holds the start and end cpos of each match as a
# pair of cpositions
# [(1355, 1357), (1477, 1479)] Example for two boundary pairs
offset_start
=
0
+
(
result_offset
+
1
)
if
result_offset
!=
0
else
result_offset
offset_end
=
self
.
nr_matches
+
result_offset
match_boundaries
=
zip
(
self
.
cqp_dump_subcorpus
(
self
.
result_subcorpus
,
CONST_FIELD_MATCH
,
0
,
self
.
nr_matches
-
1
),
offset_start
,
offset_end
),
self
.
cqp_dump_subcorpus
(
self
.
result_subcorpus
,
CONST_FIELD_MATCHEND
,
0
,
self
.
nr_matches
-
1
))
offset_start
,
offset_end
))
# Generate all cpos between match boundaries including start and end boundaries.
# Also generate cpos for left and right context.
...
...
@@ -152,7 +156,7 @@ class CQiWrapper(CQiClient):
lc
=
{
'
lc
'
:
lc_cpos
}
match_cpos
=
list
(
range
(
start
,
end
+
1
))
match
=
{
'
hit
'
:
match_cpos
}
rc_cpos
=
list
(
range
(
end
+
1
,
min
([
self
.
corpus_max_len
,
end
+
self
.
context_len
+
1
])))
rc_cpos
=
list
(
range
(
end
,
min
([
self
.
corpus_max_len
,
end
+
self
.
context_len
])))
rc
=
{
'
rc
'
:
rc_cpos
}
lc
.
update
(
match
)
lc
.
update
(
rc
)
...
...
@@ -161,81 +165,87 @@ class CQiWrapper(CQiClient):
# print(all_matches)
# print(all_cpos)
# Get all sentence IDs for all above collected cpos in all_cpos
s_ids
=
self
.
cl_cpos2struc
(
'
CORPUS.s
'
,
all_cpos
)
# CHANGE to CORPUS.s will always be like this in nopaque
# Get all cpos for all sentence boundaries
s_lookup
=
{}
for
s_id
in
set
(
s_ids
):
s_start
,
s_end
=
self
.
cl_struc2cpos
(
'
CORPUS.s
'
,
s_id
)
# CHANGE to CORPUS.s will always be like this in nopaque
# print(s_start, s_end)
s_cpos
=
range
(
s_start
,
s_end
)
s_lookup
.
update
({
s_id
:
list
(
s_cpos
)})
# print(list(s_cpos))
all_cpos
.
extend
(
s_cpos
)
# s_lookup = {}
# for s_id in set(s_ids):
# s_start, s_end = self.cl_struc2cpos('UTOPIEN.s', s_id)
# # CHANGE to UTOPIEN.s will always be like this in nopaque
# s_cpos = range(s_start, s_end)
# s_lookup.update({s_id: list(s_cpos)})
# # print(list(s_cpos))
# all_cpos.extend(s_cpos)
t0
=
time
.
time
()
all_cpos
=
list
(
set
(
all_cpos
))
# get rid of cpos duplicates
t1
=
time
.
time
()
t_total
=
t1
-
t0
print
(
'
TIME FOR ALL CPOS:
'
,
t_total
)
print
(
'
CPOS SUM:
'
,
len
(
all_cpos
))
# Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for
# all cpos entries in all_cpos_list
# Also save this information into the self.results dict
t6
=
time
.
time
()
all_cpos_infos
,
text_lookup
=
self
.
get_cpos_infos
(
all_cpos
)
t7
=
time
.
time
()
t_final
=
t7
-
t6
print
(
'
GOT ALL RESULTS IN:
'
,
t_final
)
self
.
results
=
{
'
matches
'
:
all_matches
,
'
cpos_lookup
'
:
all_cpos_infos
,
'
s_lookup
'
:
s_lookup
,
'
text_lookup
'
:
text_lookup
}
'
text_lookup
'
:
text_lookup
}
return
self
.
results
# print(self.results)
def
get_cpos_infos
(
self
,
all_cpos
):
'''
Get cpos information like CORPUS_NAME.word or CORPUS_NAME.lemma for
all cpos entries specified in the parameter all_cpos.
'''
# Get all positional attribute information
cpos_infos
=
{}
for
p_attr_key
in
self
.
attr_strings
[
'
positional_attrs
'
].
keys
():
match_strs
=
self
.
cl_cpos2str
(
self
.
attr_strings
[
'
positional_attrs
'
][
p_attr_key
],
all_cpos
)
cpos_infos
[
p_attr_key
]
=
match_strs
tmp_s_info
=
[]
tmp_text_info
=
[]
text_lookup
=
{}
tmp_dict
=
{}
# Get all structural attribute information
tmp_info
=
{}
structs_to_check
=
[]
for
struct_attr_key
in
self
.
attr_strings
[
'
struct_attrs
'
].
keys
():
check
=
self
.
attr_strings
[
'
struct_attrs
'
][
struct_attr_key
]
if
check
==
'
CORPUS.s
'
:
struct_ids
=
self
.
cl_cpos2struc
(
check
,
all_cpos
)
for
id
in
struct_ids
:
tmp_s_info
.
append
({
struct_attr_key
:
id
})
elif
check
==
'
CORPUS.text
'
:
struct_ids
=
self
.
cl_cpos2struc
(
check
,
all_cpos
)
key
=
self
.
attr_strings
[
'
struct_attrs
'
][
struct_attr_key
]
has_value
=
self
.
corpus_structural_attribute_has_values
(
key
)
struct_ids
=
self
.
cl_cpos2struc
(
key
,
all_cpos
)
if
has_value
is
False
:
# Get IDs of structural elements without values (this means get IDs of XML tags; struct elements only have values if they are XML attributes)
tmp_info
[
struct_attr_key
]
=
[]
for
id
in
struct_ids
:
tmp_
text_info
.
append
({
struct_attr_key
:
id
}
)
tmp_
info
[
struct_attr_key
].
append
(
id
)
else
:
struct_ids
=
struct_ids
=
self
.
cl_cpos2struc
(
check
,
all_cpos
)
struct_values
=
self
.
cl_struc2str
(
self
.
attr_strings
[
'
struct_attrs
'
][
struct_attr_key
],
struct_ids
)
for
value
in
struct_values
:
for
id
in
struct_ids
:
tmp_dict
.
update
({
id
:
{
struct_attr_key
:
value
}})
print
(
tmp_dict
)
print
(
text_lookup
)
# struct_entry = self.cl_cpos2struc(self.attr_strings['struct_attrs'][struct_attr_key], all_cpos)
# has_value = self.corpus_structural_attribute_has_values(self.attr_strings['struct_attrs'][struct_attr_key])
# if has_value:
# match_strs = self.cl_struc2str(self.attr_strings['struct_attrs'][struct_attr_key], struct_entry)
# elif self.attr_strings['struct_attrs'][struct_attr_key] == 'CORPUS.s':
# pass
# else:
# match_strs = [None for i in struct_entry]
# cpos_infos[struct_attr_key] = zip(struct_entry, match_strs)
tmp_list
=
[]
attr_key_list
=
[]
structs_to_check
.
append
({
key
:
struct_attr_key
})
struct_attr_values
=
list
(
tmp_info
.
values
())
struct_attr_keys
=
list
(
tmp_info
.
keys
())
# Build textlookup dictionary
text_lookup_ids
=
list
(
set
(
struct_attr_values
[
0
]))
# First is always one text
text_lookup
=
{}
for
d
in
structs_to_check
:
s_key
,
s_value
=
zip
(
*
d
.
items
())
s_value
=
s_value
[
0
].
split
(
'
_
'
)[
1
]
struct_values
=
self
.
cl_struc2str
(
s_key
[
0
],
text_lookup_ids
)
zipped
=
dict
(
zip
(
text_lookup_ids
,
struct_values
))
for
zip_key
,
zip_value
in
zipped
.
items
():
check
=
text_lookup
.
get
(
zip_key
)
if
check
is
None
:
text_lookup
[
zip_key
]
=
{
s_value
:
zip_value
}
else
:
text_lookup
[
zip_key
].
update
({
s_value
:
zip_value
})
# zip keys and values together
attr_values_list
=
[]
attr_keys_list
=
[]
for
key
in
cpos_infos
.
keys
():
tmp_list
.
append
(
cpos_infos
[
key
])
attr_key_list
.
append
(
key
)
joined_cpos_infos
=
zip
(
all_cpos
,
*
tmp_list
)
attr_values_list
.
append
(
cpos_infos
[
key
])
attr_keys_list
.
append
(
key
)
attr_keys_list
.
extend
(
struct_attr_keys
)
attr_values_list
.
extend
(
struct_attr_values
)
joined_cpos_infos
=
zip
(
all_cpos
,
*
attr_values_list
)
dict_cpos_infos
=
{}
for
info
in
joined_cpos_infos
:
dict_cpos_infos
[
info
[
0
]]
=
dict
(
zip
(
attr_key_list
,
info
[
1
:]))
for
key
,
s_id
,
text_id
in
zip
(
dict_cpos_infos
.
keys
(),
tmp_s_info
,
tmp_text_info
):
dict_cpos_infos
[
key
].
update
(
s_id
)
dict_cpos_infos
[
key
].
update
(
text_id
)
dict_cpos_infos
[
info
[
0
]]
=
dict
(
zip
(
attr_keys_list
,
info
[
1
:]))
return
dict_cpos_infos
,
text_lookup
This diff is collapsed.
Click to expand it.
app/corpora/events.py
+
5
−
42
View file @
dbd580b3
...
...
@@ -4,10 +4,6 @@ from app.models import Corpus
from
flask
import
current_app
,
request
from
flask_login
import
current_user
,
login_required
from
.CQiWrapper.CQiWrapper
import
CQiWrapper
import
sys
import
gzip
import
zlib
import
json
'''
'
A dictionary containing lists of, with corpus ids associated, Socket.IO
...
...
@@ -47,46 +43,13 @@ def corpus_analysis(message):
room
=
request
.
sid
)
return
"""
Prepare and execute a query
"""
corpus
=
'
CORPUS
'
corpus
_name
=
'
CORPUS
'
query
=
(
message
[
'
query
'
])
query_subcorpus
=
'
Results
'
client
.
cqp_query
(
corpus
,
query_subcorpus
,
query
)
client
.
select_corpus
(
corpus_name
)
client
.
query_subcorpus
(
query
)
results
=
client
.
show_query_results
(
result_len
=
int
(
message
[
'
hits_per_page
'
]),
context_len
=
int
(
message
[
'
context
'
]))
data
=
{
'
matches
'
:
[],
'
cpos_lookup
'
:
{},
'
text_loopup
'
:
{}}
"""
Evaluate query results
"""
match_corpus
=
'
{}:{}
'
.
format
(
corpus
,
query_subcorpus
)
match_num
=
min
(
int
(
message
[
'
hits_per_page
'
]),
client
.
cqp_subcorpus_size
(
match_corpus
))
match_boundaries
=
zip
(
client
.
cqp_dump_subcorpus
(
match_corpus
,
0x10
,
0
,
match_num
-
1
),
client
.
cqp_dump_subcorpus
(
match_corpus
,
0x11
,
0
,
match_num
-
1
))
context
=
15
corpus_len
=
10000
for
match_start
,
match_end
in
match_boundaries
:
data
[
'
matches
'
].
append
({
'
lc
'
:
list
(
range
(
max
(
0
,
match_start
-
int
(
message
[
'
context
'
])),
match_start
)),
'
hit
'
:
list
(
range
(
match_start
,
match_end
+
1
)),
'
rc
'
:
list
(
range
(
match_end
+
1
,
min
(
corpus_len
,
match_end
+
1
+
int
(
message
[
'
context
'
]))))})
cpos_list
=
[]
for
match
in
data
[
'
matches
'
]:
cpos_list
+=
match
[
'
lc
'
]
+
match
[
'
hit
'
]
+
match
[
'
rc
'
]
cpos_list
=
list
(
set
(
cpos_list
))
lemma_list
=
client
.
cl_cpos2str
(
'
{}.lemma
'
.
format
(
corpus
),
cpos_list
)
pos_list
=
client
.
cl_cpos2str
(
'
{}.pos
'
.
format
(
corpus
),
cpos_list
)
simple_pos_list
=
client
.
cl_cpos2str
(
'
{}.simple_pos
'
.
format
(
corpus
),
cpos_list
)
s_id_list
=
client
.
cl_cpos2struc
(
'
{}.s
'
.
format
(
corpus
),
cpos_list
)
text_id_list
=
client
.
cl_cpos2struc
(
'
{}.text
'
.
format
(
corpus
),
cpos_list
)
word_list
=
client
.
cl_cpos2str
(
'
{}.word
'
.
format
(
corpus
),
cpos_list
)
for
cpos
,
lemma
,
pos
,
simple_pos
,
s_id
,
text_id
,
word
in
zip
(
cpos_list
,
lemma_list
,
pos_list
,
simple_pos_list
,
s_id_list
,
text_id_list
,
word_list
):
data
[
'
cpos_lookup
'
][
cpos
]
=
{
'
lemma
'
:
lemma
,
'
pos
'
:
pos
,
'
simple_pos
'
:
simple_pos
,
'
s_id
'
:
s_id
,
'
text_id
'
:
text_id
,
'
word
'
:
word
}
text_author_list
=
client
.
cl_struc2str
(
'
{}.text_author
'
.
format
(
corpus
),
text_id_list
)
text_publishing_year_list
=
client
.
cl_struc2str
(
'
{}.text_publishing_year
'
.
format
(
corpus
),
text_id_list
)
text_title_list
=
client
.
cl_struc2str
(
'
{}.text_title
'
.
format
(
corpus
),
text_id_list
)
for
text_id
,
text_author
,
text_publishing_year
,
text_title
in
zip
(
text_id_list
,
text_author_list
,
text_publishing_year_list
,
text_title_list
):
data
[
'
text_loopup
'
][
text_id
]
=
{
'
author
'
:
text_author
,
'
publishing_year
'
:
text_publishing_year
,
'
title
'
:
text_title
}
socketio
.
emit
(
'
corpus_analysis
'
,
data
,
room
=
request
.
sid
)
socketio
.
emit
(
'
corpus_analysis
'
,
results
,
room
=
request
.
sid
)
def
corpus_analysis_session_handler
(
app
,
corpus_id
,
session_id
):
...
...
This diff is collapsed.
Click to expand it.
app/templates/corpora/analyse_corpus.html.j2
+
5
−
4
View file @
dbd580b3
...
...
@@ -182,6 +182,7 @@
});
socket.on("corpus_analysis", function(message) {
console.log(message);
var matchElement;
var matchTextTitlesElement;
var matchLeftContextElement;
...
...
@@ -234,7 +235,7 @@
matchHitElement.append(tokenElement);
matchHitElement.append(document.createTextNode(" "));
tokenElements.push(tokenElement);
textTitles.add(result["text_loo
p
up"][token["text
_id
"]]["title"]);
textTitles.add(result["text_loo
k
up"][token["text"]]["title"]);
}
matchTextTitlesElement.innerText = [...textTitles].join(",");
matchElement.append(matchHitElement);
...
...
@@ -274,9 +275,9 @@
simple_pos: ${token["simple_pos"]}
</td>
<td class="left-align">
Title: ${result["text_loo
p
up"][token["text
_id
"]]["title"]}<br>
Author: ${result["text_loo
p
up"][token["text
_id
"]]["title"]}<br>
Publishing year: ${result["text_loo
p
up"][token["text
_id
"]]["publishing_year"]}
Title: ${result["text_loo
k
up"][token["text"]]["title"]}<br>
Author: ${result["text_loo
k
up"][token["text"]]["title"]}<br>
Publishing year: ${result["text_loo
k
up"][token["text"]]["publishing_year"]}
</td>
</tr>
</table>`,
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment