From 180ebffbe695b375faa8a12d352342f10d507468 Mon Sep 17 00:00:00 2001 From: Christopher Lenke <christopher.lenke@uni-bielefeld.de> Date: Mon, 23 Sep 2024 18:39:42 +0200 Subject: [PATCH] 23.09.2024 --- ..._matching_d_pg_text__text__text__text_.txt | 402 ++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 Procedures/pattern_matching_d_pg_text__text__text__text_.txt diff --git a/Procedures/pattern_matching_d_pg_text__text__text__text_.txt b/Procedures/pattern_matching_d_pg_text__text__text__text_.txt new file mode 100644 index 0000000..aa30892 --- /dev/null +++ b/Procedures/pattern_matching_d_pg_text__text__text__text_.txt @@ -0,0 +1,402 @@ +-- DROP PROCEDURE schema_xyz.pattern_matching_d_pg(text, text, text, text); + +CREATE OR REPLACE PROCEDURE schema_xyz.pattern_matching_d_pg(table_in text, table_out text, table_out_ii text, process_name text) + LANGUAGE plpgsql +AS $procedure$ + +#variable_conflict use_variable +DECLARE + +mc cursor is select /*+ parallel(24) */ pattern_nf, pattern_city, pid, uid_ from schema_xyz.pattern_ges_like; +mc2 cursor is SELECT /*+ parallel(24) */ pattern_nf, pattern_city, pid, uid_ from schema_xyz.pattern_ges_like where pattern_city='%'; +ac cursor is SELECT /*+ parallel(24) */ address_full, address_full_uf, city, adr_id from schema_xyz.zzz_unkodierte where city is not null and not address_full=address_full_uf; +ac2 cursor is SELECT /*+ parallel(24) */ address_full, address_full_uf, city, adr_id from schema_xyz.zzz_unkodierte where city is null and not address_full=address_full_uf; +ac3 cursor is SELECT /*+ parallel(24) */ address_full, address_full_uf, city, adr_id from schema_xyz.zzz_unkodierte_II where city is not null and not address_full=address_full_uf; +ac4 cursor is SELECT /*+ parallel(24) */ address_full, address_full_uf, city, adr_id from schema_xyz.zzz_unkodierte_II where city is null and not address_full=address_full_uf; +ac5 CURSOR IS select /*+ parallel(24) */ address_full, city, adr_id from schema_xyz.zzz_unkodierte_I where city is not NULL; +ac6 CURSOR IS select /*+ parallel(24) */ address_full, city, adr_id from schema_xyz.zzz_unkodierte_I where city is NULL; + +test_str varchar; + + muster RECORD; + adresse RECORD; + +Begin + +raise notice 'prozess name: %s', prozess_name ; +raise notice 'table_out: %s', table_out ; + +execute 'drop table if exists '||table_out||''; +EXECUTE 'create table '||table_out||'(address_full varchar not null, pattern_nf varchar not null, city varchar, +pattern_city varchar, adr_id numeric not null, pid numeric not null, uid_ numeric not null, reason varchar)'; + + +/* +++++++ If city is not null, everything can be assigned as usual (sample comparison for address and city) +++++++ */ + + + +/*############################## schema_xyz.process_info ###############################*/ + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Loop 1: city not null'', '''||current_timestamp::varchar||''''; +commit; + For muster in mc + + loop + + EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' ( + select /*+ parallel(24) */ address_full, '''||muster.pattern_nf||''', city, + '''||muster.pattern_city||''', adr_id, '||muster.pid||', '||muster.uid_||', ''normal'' + from '||table_in||' + where city is not null + and address_full like '''||muster.pattern_nf||''' and city like '''||muster.pattern_city||''')'; + commit; + + END LOOP; + + +/* +++++++ If city is null, only the patterns with pattern_city='%' are unproblematic, others should not be used. This case does not occur with WoS. */ + + +/*############################## schema_xyz.process_info ###############################*/ +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Loop 2: city null'', '''||current_timestamp::varchar||''''; +commit; + + FOR muster in mc2 + + LOOP + + EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' ( + select /*+ parallel(24) */ address_full, '''||muster.pattern_nf||''', city, + '''||muster.pattern_city||''', adr_id, '||muster.pid||', '||muster.uid_||', ''normal'' + from '||table_in||' + where city is null + and address_full like '''||muster.pattern_nf||''')'; + commit; + + END LOOP; + + +/* +++++++ Additional step for pattern_matching: * all those that are not yet coded and have 'UNIV' in the address * are rotated again and then compared with the patterns. Intermediate table for this: schema_xyz.zzz_unkodierte. +++++++ */ + +execute 'drop index if exists schema_xyz.i_result_ii_adr_id'; +execute 'drop index if exists schema_xyz.i_result_iii_adr_id'; +execute 'drop index if exists schema_xyz.i_result_iii_uid_'; +execute 'drop index if exists schema_xyz.i_result_iii_address_full'; +execute 'drop index if exists schema_xyz.i_result_iii_city'; +commit; + +execute 'create index i_result_ii_adr_id on schema_xyz.result_II(adr_id)'; +commit; +execute 'create index i_result_iii_adr_id on schema_xyz.result_III(adr_id)'; +commit; +execute 'create index i_result_iii_uid_ on schema_xyz.result_III(uid_)'; +commit; +execute 'create index i_result_iii_address_full on schema_xyz.result_III(address_full)'; +commit; +execute 'create index i_result_iii_city on schema_xyz.result_III(city)'; +commit; + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''...indexes on result_iii done'', '''||current_timestamp::varchar||''''; +commit; + + +execute 'drop table if exists schema_xyz.zzz_unkodierte'; +EXECUTE 'create table schema_xyz.zzz_unkodierte as ( +select adr_id,address_full,city, '''' as address_full_uf +from schema_xyz.result_II +where not adr_id in (select adr_id from schema_xyz.result_III) +and address_full like ''%,%UNIV%'')'; +commit; +execute 'create index on schema_xyz.zzz_unkodierte(address_full)'; +commit; +execute 'create index on schema_xyz.zzz_unkodierte(address_full_uf)'; +commit; +execute 'create index on schema_xyz.zzz_unkodierte(city)'; +commit; +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''......creation of schema_xyz.zzz_unkodierte done'', '''||current_timestamp::varchar||''''; +commit; + + +EXECUTE 'update schema_xyz.zzz_unkodierte set address_full_uf=schema_xyz.univ_first_pg(address_full)'; +commit; + +EXECUTE 'delete from schema_xyz.zzz_unkodierte where address_full_uf is null'; +commit; + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''univ_first done'', '''||current_timestamp::varchar||''''; +commit; + +/*############################## schema_xyz.process_info ###############################*/ + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Additional step uncoded: turn so that the UNIV part is at the front'', '''||current_timestamp::varchar||''''; +commit; +/*############################## schema_xyz.process_info ###############################*/ + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Loop 1: city not null'', '''||current_timestamp::varchar||''''; +commit; + + for adresse in ac + + LOOP + + EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' ( + select /*+ parallel(24) */ '''||adresse.ADDRESS_FULL||''', + pattern_nf, '''||adresse.CITY||''', + pattern_city, '||adresse.ADR_ID||', pid, uid_ , ''univ first'' + from schema_xyz.pattern_ges_like + where (upper('''||adresse.ADDRESS_FULL_UF||''') LIKE pattern_nf + and upper('''||adresse.CITY||''') LIKE pattern_city))'; + commit; + + + END LOOP; + + +/* If city is null, only the patterns with pattern_city='%' * are unproblematic, others should not be used. + * This case does not occur with WoS. */ + + +/*############################## schema_xyz.process_info ###############################*/ + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Loop 2: city null'', '''||current_timestamp::varchar||''''; +commit; + + for adresse in ac2 + + LOOP + + EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' ( + select /*+ parallel(24) */ '''||adresse.ADDRESS_FULL||''', pattern_nf, null, + pattern_city, '||adresse.ADR_ID||', pid, uid_ , ''univ first'' + from (select * from schema_xyz.pattern_ges_like where pattern_city=''%'') a + where (upper('''||adresse.ADDRESS_FULL_UF||''') LIKE pattern_nf))'; + commit; + + + END LOOP; + + +/*############################## schema_xyz.process_info ###############################*/ + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''End of additional step uncoded.'', '''||current_timestamp::varchar||''''; +commit; + +/*############################## PROZESS_INFO ###############################*/ +execute 'insert into schema_xyz.process_info select '''||process_name||''', ''Start additional step uncoded: all NF patterns ending in .% can also end directly (without .%)'', '''||current_timestamp::varchar||''''; +commit; + +execute 'drop table if exists schema_xyz.zzz_unkodierte_I'; + + +execute 'create table schema_xyz.zzz_unkodierte_I as ( +select a.adr_id,a.address_full,a.city +from schema_xyz.result_II a +left join schema_xyz.result_iii b +on a.adr_id = b.adr_id +where b.adr_id is null)'; +commit; +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''......creation of schema_xyz.zzz_unkodierte_I done'', '''||current_timestamp::varchar||''''; +commit; + + +/*############################## PROZESS_INFO ###############################*/ + + +execute 'drop table if exists schema_xyz.pgs_comma'; +execute 'create table schema_xyz.pgs_comma as (select pid, uid_, +substr(pattern_nf, 1, length(pattern_nf)-2) as pattern_nf, pattern_city +from schema_xyz.PATTERN_GES_LIKE where pattern_nf like ''%,\%'')'; +execute 'drop index if exists schema_xyz.i_pgs_comma_pattern_nf'; +execute 'create index on schema_xyz.pgs_comma(pattern_nf)'; +commit; +execute 'drop index if exists schema_xyz.i_pgs_comma_pattern_city'; +execute 'create index on schema_xyz.pgs_comma(pattern_city)'; +commit; + + +execute 'insert into schema_xyz.process_info select '''||process_name||''', ''Start Loop 1: city not null'', '''||current_timestamp::varchar||''''; +commit; + + for adresse in ac5 + + LOOP + + execute 'insert /*+ parallel(24) */ into '||table_out||' ( + select /*+ parallel(24) */ '''||adresse.ADDRESS_FULL||''', + pattern_nf, '''||adresse.CITY||''', + pattern_city, '||adresse.ADR_ID||', pid, uid_ , ''pattern without comma'' + from schema_xyz.pgs_comma a + where (upper('''||adresse.ADDRESS_FULL||''') LIKE pattern_nf + and upper('''||adresse.CITY||''') LIKE pattern_city))'; + commit; + + + END LOOP; + + + + +/*############################## PROZESS_INFO ###############################*/ +execute 'insert into schema_xyz.process_info select '''||process_name||''', ''Start Loop 2: city null'', '''||current_timestamp::varchar||''''; +commit; + + for adresse in ac6 + + LOOP + + EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' (select /*+ parallel(24) */ '''||adresse.ADDRESS_FULL||''', pattern_nf, null, + pattern_city, '||adresse.ADR_ID||', pid, uid_ , ''pattern without comma'' + from (select * from schema_xyz.pgs_comma where pattern_city=''%'') a + where (upper('''||adresse.ADDRESS_FULL||''') LIKE pattern_nf))'; + commit; + + + END LOOP; + +drop index if exists zzz_unkodierte_i_address_full_idx; +drop index if exists zzz_unkodierte_i_address_full_uf_idx; +drop index if exists zzz_unkodierte_i_city_idx; +drop table if exists zzz_unkodierte_i; +execute 'drop table if exists schema_xyz.pgs_comma'; + +/*############################## PROZESS_INFO ###############################*/ +execute 'insert into schema_xyz.process_info select '''||process_name||''', ''End Additional step uncoded, NF pattern end.'', '''||current_timestamp::varchar||''''; +commit; + +/* +++++++ Additional step for pattern_matching: compare all those that * have not yet been coded with the original address again, * in the event that very special patterns were present; for example, patterns that contain spelling errors, etc. +Intermediate table for this: schema_xyz.zzz_unkodierte_II. +++++++ */ +--EXECUTE 'truncate table schema_xyz.zzz_unkodierte_II'; +execute 'drop table if exists schema_xyz.zzz_unkodierte_II'; + +/*EXECUTE 'create table schema_xyz.zzz_unkodierte_II(address_full varchar not null, city varchar, +adr_id varchar not null, address_full_original varchar, address_full_uf varchar)';*/ + +execute 'drop index if exists schema_xyz.i_result_ii_adrid'; +execute 'create index i_result_ii_adrid on schema_xyz.result_II(adr_id)'; +commit; + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''...adr_id index on result_ii created'', '''||current_timestamp::varchar||''''; +commit; + +execute 'drop index if exists schema_xyz.i_result_i_address_full'; +execute 'create index i_result_i_address_full on schema_xyz.result_I(address_full)'; + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''...address full index on result_i created'', '''||current_timestamp::varchar||''''; +commit; + + +execute 'drop table if exists schema_xyz.zzz_unkodierte_II_pre'; +execute 'create table schema_xyz.zzz_unkodierte_II_pre as ( +select a.adr_id, a.address_full +from schema_xyz.result_ii a +left join schema_xyz.result_iii b +on a.adr_id=b.adr_id +where b.adr_id is null)'; +commit; + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''...create zzz_II_pre done'', '''||current_timestamp::varchar||''''; +commit; + +execute 'drop index if exists schema_xyz.i_zzz_ii_pre'; +execute 'create index i_zzz_ii_pre on schema_xyz.zzz_unkodierte_II_pre(address_full)'; +commit; + +execute 'drop table if exists schema_xyz.zzz_unkodierte_II'; +execute 'create table schema_xyz.zzz_unkodierte_II as ( +select b.adr_id,a.address_full,a.city,upper(address_full_original) as address_full_original, +upper(a.address_full_original) as address_full_uf +from schema_xyz.result_i a +join schema_xyz.zzz_unkodierte_II_pre b +on a.address_full = b.address_full)'; +commit; + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''...create zzz_II done'', '''||current_timestamp::varchar||''''; +commit; + + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''...indexes zzz_II done'', '''||current_timestamp::varchar||''''; +commit; + +EXECUTE 'update schema_xyz.zzz_unkodierte_II +set address_full_uf=replace(address_full_uf,'''''''','''')'; +commit; + + +/*############################## schema_xyz.process_info ###############################*/ + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Additional step unencoded: Use originals'', '''||current_timestamp::varchar||''''; +commit; + +/*############################## schema_xyz.process_info ###############################*/ + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Loop 1: city not null'', '''||current_timestamp::varchar||''''; +commit; + + for adresse in ac3 + + LOOP + + EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' (select /*+ parallel(24) */ '''||adresse.ADDRESS_FULL||''', + pattern_nf, '''||adresse.CITY||''', + pattern_city, '||adresse.ADR_ID||', pid, uid_ , ''original'' + from schema_xyz.pattern_ges_like + where (upper('''||adresse.ADDRESS_FULL_UF||''') LIKE pattern_nf + and upper('''||adresse.CITY||''') LIKE pattern_city))'; + commit; + + + END LOOP; + + +/* If city is null, only the patterns with pattern_city='%' are unproblematic, others should not be used. This case does not occur with WoS. */ + + + +/*############################## schema_xyz.process_info ###############################*/ + +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''Loop 2: city null'', '''||current_timestamp::varchar||''''; +commit; + + for adresse in ac4 + + LOOP + + EXECUTE 'insert /*+ parallel(24) */ into '||table_out||' ( + select /*+ parallel(24) */ '''||adresse.ADDRESS_FULL||''', pattern_nf, null, + pattern_city, '||adresse.ADR_ID||', pid, uid_ , ''original'' + from (select * from schema_xyz.pattern_ges_like where pattern_city=''%'') a + where (upper('''||adresse.ADDRESS_FULL_UF||''') LIKE pattern_nf))'; + commit; + + + END LOOP; + + + +/*############################## schema_xyz.process_info ###############################*/ + +EXECUTE 'insert into schema_xyz.process_info select '''||Process_name||''', ''End of additional step unencoded: Use originals'', '''||current_timestamp::varchar||''''; +commit; + + +/* +++++++ Grouping for the intermediate result +++++++ */ + +EXECUTE 'drop table if exists '||table_out_II||''; + +EXECUTE 'create table '||table_out_II||' as (select address_full, city, + adr_id, uid_ + from '||table_out||' + group by address_full, city, + adr_id, uid_)'; +commit; + + +/*############################## schema_xyz.process_info ###############################*/ +EXECUTE 'insert into schema_xyz.process_info select '''||process_name||''', ''End pattern_matching_d'', '''||current_timestamp::varchar||''''; +commit; + + +END; + +$procedure$ +; -- GitLab