From 180ebffbe695b375faa8a12d352342f10d507468 Mon Sep 17 00:00:00 2001
From: Christopher Lenke <christopher.lenke@uni-bielefeld.de>
Date: Mon, 23 Sep 2024 18:39:42 +0200
Subject: [PATCH] 23.09.2024

---
 ..._matching_d_pg_text__text__text__text_.txt | 402 ++++++++++++++++++
 1 file changed, 402 insertions(+)
 create mode 100644 Procedures/pattern_matching_d_pg_text__text__text__text_.txt

diff --git a/Procedures/pattern_matching_d_pg_text__text__text__text_.txt b/Procedures/pattern_matching_d_pg_text__text__text__text_.txt
new file mode 100644
index 0000000..aa30892
--- /dev/null
+++ b/Procedures/pattern_matching_d_pg_text__text__text__text_.txt
@@ -0,0 +1,402 @@
+-- DROP PROCEDURE schema_xyz.pattern_matching_d_pg(text, text, text, text);
+
+CREATE OR REPLACE PROCEDURE schema_xyz.pattern_matching_d_pg(table_in text, table_out text, table_out_ii text, process_name text)
+ LANGUAGE plpgsql
+AS $procedure$
+	
+#variable_conflict use_variable	
+DECLARE 
+
+mc cursor is select /*+ parallel(24) */ pattern_nf, pattern_city, pid, uid_ from schema_xyz.pattern_ges_like;
+mc2 cursor is SELECT /*+ parallel(24) */ pattern_nf, pattern_city, pid, uid_ from schema_xyz.pattern_ges_like where pattern_city='%';
+ac cursor is SELECT /*+ parallel(24) */ address_full, address_full_uf, city, adr_id from schema_xyz.zzz_unkodierte where city is not null and not address_full=address_full_uf;
+ac2 cursor is SELECT /*+ parallel(24) */ address_full, address_full_uf, city, adr_id from schema_xyz.zzz_unkodierte where city is null and not address_full=address_full_uf;
+ac3 cursor is SELECT /*+ parallel(24) */ address_full, address_full_uf, city, adr_id from schema_xyz.zzz_unkodierte_II where city is not null and not address_full=address_full_uf;
+ac4 cursor is SELECT /*+ parallel(24) */ address_full, address_full_uf, city, adr_id from schema_xyz.zzz_unkodierte_II where city is null and not address_full=address_full_uf;
+ac5 CURSOR IS select /*+ parallel(24) */ address_full, city, adr_id from schema_xyz.zzz_unkodierte_I where city is not NULL;
+ac6 CURSOR IS select /*+ parallel(24) */ address_full, city, adr_id from schema_xyz.zzz_unkodierte_I where city is NULL;      
+
+test_str varchar;
+
+  muster RECORD;
+  adresse RECORD;
+
+Begin
+	
+raise notice 'prozess name: %s', prozess_name ;
+raise notice 'table_out: %s', table_out ;
+
+execute 'drop table if exists '||table_out||'';
+EXECUTE 'create table '||table_out||'(address_full varchar not null, pattern_nf varchar not null, city varchar, 
+pattern_city varchar, adr_id numeric not null, pid numeric not null, uid_ numeric not null, reason varchar)';
+
+
+/* +++++++ If city is not null, everything can be assigned as usual (sample comparison for address and city) +++++++ */
+    
+    
+
+/*############################## schema_xyz.process_info ###############################*/
+
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Loop 1: city not null'', '''||current_timestamp::varchar||'''';
+commit;
+	For muster in mc 
+
+          loop
+         
+          EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' (
+          select /*+ parallel(24) */ address_full, '''||muster.pattern_nf||''', city, 
+          '''||muster.pattern_city||''', adr_id, '||muster.pid||', '||muster.uid_||', ''normal'' 
+          from '||table_in||' 
+          where city is not null 
+          and address_full like '''||muster.pattern_nf||''' and city like '''||muster.pattern_city||''')';
+          commit;
+
+          END LOOP;
+
+
+/* +++++++ If city is null, only the patterns with pattern_city='%' are unproblematic, others should not be used. This case does not occur with WoS. */
+
+       
+/*############################## schema_xyz.process_info ###############################*/
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Loop 2: city null'', '''||current_timestamp::varchar||'''';
+commit;
+
+    FOR muster in mc2
+    
+          LOOP
+
+          EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' (
+          select /*+ parallel(24) */ address_full, '''||muster.pattern_nf||''', city, 
+          '''||muster.pattern_city||''', adr_id, '||muster.pid||', '||muster.uid_||', ''normal'' 
+          from '||table_in||' 
+          where city is null 
+          and address_full like '''||muster.pattern_nf||''')';
+          commit;
+
+    END LOOP;
+
+
+/* +++++++ Additional step for pattern_matching: * all those that are not yet coded and have 'UNIV' in the address * are rotated again and then compared with the patterns. Intermediate table for this: schema_xyz.zzz_unkodierte. +++++++ */
+
+execute 'drop index if exists schema_xyz.i_result_ii_adr_id';
+execute 'drop index if exists schema_xyz.i_result_iii_adr_id';
+execute 'drop index if exists schema_xyz.i_result_iii_uid_';
+execute 'drop index if exists schema_xyz.i_result_iii_address_full';
+execute 'drop index if exists schema_xyz.i_result_iii_city';
+commit;
+
+execute 'create index i_result_ii_adr_id on schema_xyz.result_II(adr_id)';
+commit;
+execute 'create index i_result_iii_adr_id on schema_xyz.result_III(adr_id)';
+commit;
+execute 'create index i_result_iii_uid_ on schema_xyz.result_III(uid_)';
+commit;
+execute 'create index i_result_iii_address_full on schema_xyz.result_III(address_full)';
+commit;
+execute 'create index i_result_iii_city on schema_xyz.result_III(city)';
+commit;
+
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''...indexes on result_iii done'', '''||current_timestamp::varchar||'''';
+commit;
+   
+   
+execute 'drop table if exists schema_xyz.zzz_unkodierte';
+EXECUTE 'create table schema_xyz.zzz_unkodierte as (
+select adr_id,address_full,city, '''' as address_full_uf   
+from schema_xyz.result_II 
+where not adr_id in (select adr_id from schema_xyz.result_III)
+and address_full like ''%,%UNIV%'')';
+commit;
+execute 'create index on schema_xyz.zzz_unkodierte(address_full)';
+commit;
+execute 'create index on schema_xyz.zzz_unkodierte(address_full_uf)';
+commit;
+execute 'create index on schema_xyz.zzz_unkodierte(city)';
+commit;
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''......creation of schema_xyz.zzz_unkodierte done'', '''||current_timestamp::varchar||'''';
+commit;
+
+
+EXECUTE 'update schema_xyz.zzz_unkodierte set address_full_uf=schema_xyz.univ_first_pg(address_full)';
+commit;
+
+EXECUTE 'delete from schema_xyz.zzz_unkodierte where address_full_uf is null';
+commit;
+
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''univ_first done'', '''||current_timestamp::varchar||'''';
+commit;
+
+/*############################## schema_xyz.process_info ###############################*/
+
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Additional step uncoded: turn so that the UNIV part is at the front'', '''||current_timestamp::varchar||'''';
+commit;
+/*############################## schema_xyz.process_info ###############################*/
+
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Loop 1: city not null'', '''||current_timestamp::varchar||'''';
+commit;
+                      
+    for adresse in ac
+    
+          LOOP
+       
+             EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' (
+             select /*+ parallel(24) */ '''||adresse.ADDRESS_FULL||''', 
+             pattern_nf, '''||adresse.CITY||''', 
+             pattern_city, '||adresse.ADR_ID||', pid, uid_ , ''univ first''
+             from schema_xyz.pattern_ges_like 
+             where (upper('''||adresse.ADDRESS_FULL_UF||''') LIKE pattern_nf 
+             and upper('''||adresse.CITY||''') LIKE pattern_city))';
+             commit;
+
+   
+          END LOOP;
+
+
+/* If city is null, only the patterns with pattern_city='%' * are unproblematic, others should not be used. 
+ * This case does not occur with WoS. */
+       
+
+/*############################## schema_xyz.process_info ###############################*/
+       
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Loop 2: city null'', '''||current_timestamp::varchar||'''';
+commit;
+
+      for adresse in ac2
+      
+          LOOP
+      
+             EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' (
+			 select /*+ parallel(24) */ '''||adresse.ADDRESS_FULL||''', pattern_nf, null, 
+             pattern_city, '||adresse.ADR_ID||', pid, uid_ , ''univ first''
+             from (select * from schema_xyz.pattern_ges_like where pattern_city=''%'') a
+             where (upper('''||adresse.ADDRESS_FULL_UF||''') LIKE pattern_nf))';
+             commit;
+
+   
+          END LOOP;
+
+
+/*############################## schema_xyz.process_info ###############################*/
+
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''End of additional step uncoded.'', '''||current_timestamp::varchar||'''';
+commit;
+
+/*############################## PROZESS_INFO ###############################*/
+execute 'insert into schema_xyz.process_info select '''||process_name||''', ''Start additional step uncoded: all NF patterns ending in .% can also end directly (without .%)'', '''||current_timestamp::varchar||'''';
+commit;
+
+execute 'drop table if exists schema_xyz.zzz_unkodierte_I';
+
+
+execute 'create table schema_xyz.zzz_unkodierte_I as (
+select a.adr_id,a.address_full,a.city 
+from schema_xyz.result_II a 
+left join schema_xyz.result_iii b 
+on a.adr_id = b.adr_id
+where b.adr_id is null)';
+commit;
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''......creation of schema_xyz.zzz_unkodierte_I done'', '''||current_timestamp::varchar||'''';
+commit;
+
+
+/*############################## PROZESS_INFO ###############################*/
+
+
+execute 'drop table if exists schema_xyz.pgs_comma';
+execute 'create table schema_xyz.pgs_comma as (select pid, uid_, 
+substr(pattern_nf, 1, length(pattern_nf)-2) as pattern_nf, pattern_city  
+from schema_xyz.PATTERN_GES_LIKE where pattern_nf like ''%,\%'')';
+execute 'drop index if exists schema_xyz.i_pgs_comma_pattern_nf';
+execute 'create index on schema_xyz.pgs_comma(pattern_nf)';
+commit;
+execute 'drop index if exists schema_xyz.i_pgs_comma_pattern_city';
+execute 'create index on schema_xyz.pgs_comma(pattern_city)';
+commit;
+
+
+execute 'insert into schema_xyz.process_info select '''||process_name||''', ''Start Loop 1: city not null'', '''||current_timestamp::varchar||'''';
+commit;
+                      
+    for adresse in ac5
+   
+          LOOP
+       
+             execute 'insert /*+ parallel(24) */ into '||table_out||' (
+             select /*+ parallel(24) */ '''||adresse.ADDRESS_FULL||''', 
+             pattern_nf, '''||adresse.CITY||''', 
+             pattern_city, '||adresse.ADR_ID||', pid, uid_ , ''pattern without comma''
+             from schema_xyz.pgs_comma a
+             where (upper('''||adresse.ADDRESS_FULL||''') LIKE pattern_nf   
+             and upper('''||adresse.CITY||''') LIKE pattern_city))';
+             commit;
+          
+   
+          END LOOP;                  
+
+
+ 
+
+/*############################## PROZESS_INFO ###############################*/       
+execute 'insert into schema_xyz.process_info select '''||process_name||''', ''Start Loop 2: city null'', '''||current_timestamp::varchar||'''';
+commit; 
+
+      for adresse in ac6
+   
+          LOOP
+          
+             EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' (select /*+ parallel(24) */ '''||adresse.ADDRESS_FULL||''', pattern_nf, null, 
+             pattern_city, '||adresse.ADR_ID||', pid, uid_ , ''pattern without comma''
+             from (select * from schema_xyz.pgs_comma where pattern_city=''%'') a
+             where (upper('''||adresse.ADDRESS_FULL||''') LIKE pattern_nf))';
+             commit;
+          
+   
+          END LOOP;                  
+
+drop index if exists zzz_unkodierte_i_address_full_idx;
+drop index if exists zzz_unkodierte_i_address_full_uf_idx;
+drop index if exists zzz_unkodierte_i_city_idx;
+drop table if exists zzz_unkodierte_i;
+execute 'drop table if exists schema_xyz.pgs_comma';
+
+/*############################## PROZESS_INFO ###############################*/
+execute 'insert into schema_xyz.process_info select '''||process_name||''', ''End Additional step uncoded, NF pattern end.'', '''||current_timestamp::varchar||'''';
+commit;
+
+/* +++++++ Additional step for pattern_matching: compare all those that * have not yet been coded with the original address again, * in the event that very special patterns were present; for example, patterns that contain spelling errors, etc. 
+Intermediate table for this: schema_xyz.zzz_unkodierte_II. +++++++ */
+--EXECUTE 'truncate table schema_xyz.zzz_unkodierte_II';
+execute 'drop table if exists schema_xyz.zzz_unkodierte_II';
+
+/*EXECUTE 'create table schema_xyz.zzz_unkodierte_II(address_full varchar not null, city varchar, 
+adr_id varchar not null, address_full_original varchar, address_full_uf varchar)';*/
+
+execute 'drop index if exists schema_xyz.i_result_ii_adrid';
+execute 'create index i_result_ii_adrid on schema_xyz.result_II(adr_id)';
+commit;
+
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''...adr_id index on result_ii created'', '''||current_timestamp::varchar||'''';
+commit;
+ 
+execute 'drop index if exists schema_xyz.i_result_i_address_full';
+execute 'create index i_result_i_address_full on schema_xyz.result_I(address_full)';
+
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''...address full index on result_i created'', '''||current_timestamp::varchar||'''';
+commit;
+
+
+execute 'drop table if exists schema_xyz.zzz_unkodierte_II_pre';
+execute 'create table schema_xyz.zzz_unkodierte_II_pre as (
+select a.adr_id, a.address_full 
+from schema_xyz.result_ii a
+left join schema_xyz.result_iii b
+on a.adr_id=b.adr_id
+where b.adr_id is null)';
+commit;
+
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''...create zzz_II_pre done'', '''||current_timestamp::varchar||'''';
+commit;
+
+execute 'drop index if exists schema_xyz.i_zzz_ii_pre';
+execute 'create index i_zzz_ii_pre on schema_xyz.zzz_unkodierte_II_pre(address_full)';
+commit;
+
+execute 'drop table if exists schema_xyz.zzz_unkodierte_II';
+execute 'create table schema_xyz.zzz_unkodierte_II as (
+select b.adr_id,a.address_full,a.city,upper(address_full_original) as address_full_original, 
+upper(a.address_full_original) as address_full_uf  
+from schema_xyz.result_i a 
+join schema_xyz.zzz_unkodierte_II_pre b
+on a.address_full = b.address_full)';
+commit;
+
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''...create zzz_II done'', '''||current_timestamp::varchar||'''';
+commit;
+
+
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''...indexes zzz_II done'', '''||current_timestamp::varchar||'''';
+commit;
+
+EXECUTE 'update schema_xyz.zzz_unkodierte_II 
+set address_full_uf=replace(address_full_uf,'''''''','''')';
+commit;
+
+
+/*############################## schema_xyz.process_info ###############################*/
+
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Additional step unencoded: Use originals'', '''||current_timestamp::varchar||'''';
+commit;
+
+/*############################## schema_xyz.process_info ###############################*/
+
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Loop 1: city not null'', '''||current_timestamp::varchar||'''';
+commit;
+                      
+    for adresse in ac3
+    
+          LOOP 
+            
+             EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' (select /*+ parallel(24) */ '''||adresse.ADDRESS_FULL||''', 
+             pattern_nf, '''||adresse.CITY||''', 
+             pattern_city, '||adresse.ADR_ID||', pid, uid_ , ''original''
+             from schema_xyz.pattern_ges_like 
+             where (upper('''||adresse.ADDRESS_FULL_UF||''') LIKE pattern_nf 
+             and upper('''||adresse.CITY||''') LIKE pattern_city))';
+             commit;
+
+   
+          END LOOP;
+          
+          
+/* If city is null, only the patterns with pattern_city='%' are unproblematic, others should not be used. This case does not occur with WoS. */
+
+       
+
+/*############################## schema_xyz.process_info ###############################*/
+       
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Loop 2: city null'', '''||current_timestamp::varchar||'''';
+commit;
+
+      for adresse in ac4
+      
+          LOOP
+          
+             EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' (
+			select /*+ parallel(24) */ '''||adresse.ADDRESS_FULL||''', pattern_nf, null, 
+             pattern_city, '||adresse.ADR_ID||', pid, uid_ , ''original''
+             from (select * from schema_xyz.pattern_ges_like where pattern_city=''%'') a
+             where (upper('''||adresse.ADDRESS_FULL_UF||''') LIKE pattern_nf))';
+             commit;
+
+   
+          END LOOP;
+
+
+
+/*############################## schema_xyz.process_info ###############################*/
+
+EXECUTE 'insert into schema_xyz.process_info select '''||Process_name||''', ''End of additional step unencoded: Use originals'', '''||current_timestamp::varchar||'''';
+commit;
+
+
+/* +++++++ Grouping for the intermediate result +++++++ */
+
+EXECUTE 'drop table if exists '||table_out_II||'';
+
+EXECUTE 'create table '||table_out_II||' as (select address_full, city, 
+   adr_id, uid_
+   from '||table_out||'
+   group by  address_full, city, 
+   adr_id, uid_)';
+commit;
+
+
+/*############################## schema_xyz.process_info ###############################*/
+EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''End pattern_matching_d'', '''||current_timestamp::varchar||'''';
+commit;
+     
+     
+END;
+
+$procedure$
+;
-- 
GitLab