From 413b6111df712972a09e8a8c87dc4b1436793a7c Mon Sep 17 00:00:00 2001
From: Patrick Jentsch <p.jentsch@uni-bielefeld.de>
Date: Mon, 3 Jul 2023 15:31:28 +0200
Subject: [PATCH] Implement fast boundary computation for the 'ent' and 's' s_attrs

---
 app/corpora/cqi_over_sio/extensions.py | 27 +++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/app/corpora/cqi_over_sio/extensions.py b/app/corpora/cqi_over_sio/extensions.py
index d92b6d5e..0ff166bf 100644
--- a/app/corpora/cqi_over_sio/extensions.py
+++ b/app/corpora/cqi_over_sio/extensions.py
@@ -99,12 +99,29 @@ def ext_corpus_static_data(corpus: str) -> Dict:
         static_corpus_data['corpus']['counts'][s_attr.name] = s_attr.size
         static_corpus_data['s_attrs'][s_attr.name] = {'lexicon': {}, 'values': None}
         static_corpus_data['values']['s_attrs'][s_attr.name] = {}
+        if s_attr.name in ['s', 'ent']:
+            cqi_corpus.query('Last', f'<{s_attr.name}> []* </{s_attr.name}>;')
+            cqi_subcorpus = cqi_corpus.subcorpora.get('Last')
+            first_match = 0
+            last_match = cqi_subcorpus.size - 1
+            match_boundaries = zip(
+                range(first_match, last_match + 1),
+                cqi_subcorpus.dump(cqi_subcorpus.fields['match'], first_match, last_match),
+                cqi_subcorpus.dump(cqi_subcorpus.fields['matchend'], first_match, last_match)
+            )
+            for id, lbound, rbound in match_boundaries:
+                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
+                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
+                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
+                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
+            cqi_subcorpus.drop()
         for id in range(0, s_attr.size):
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
-            lbound, rbound = s_attr.cpos_by_id(id)
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
-            static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
+            if s_attr.name not in ['s', 'ent']:
+                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id] = {}
+                lbound, rbound = s_attr.cpos_by_id(id)
+                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['bounds'] = [lbound, rbound]
+                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts'] = {}
+                static_corpus_data['s_attrs'][s_attr.name]['lexicon'][id]['counts']['token'] = rbound - lbound + 1
             if s_attr.name not in ['text', 's']:
                 continue
             cpos_range = range(lbound, rbound + 1)
-- 
GitLab