Commit cb3614d8 authored by Marie Kl

#20

Created a common compound list from the wiki and converted it to lowercase.
Lists common compounds in the corpus.
Counts occurrences of each compound per file and in total.
parent 7909f0fe
import pandas as pd


def get_wiki_compounds():
    # Load the lowercased compound list extracted from the wiki
    compounds = pd.read_csv('./compounds/wiki_compounds_lc.csv')
    # print(compounds.head())

    # Count how often each element occurs across the compound synonyms
    most_common_compound_elements = compounds.synonyms.str.split(expand=True).stack().value_counts().reset_index()
    most_common_compound_elements.columns = ['compound_element', 'frequency']
    # print(most_common_compound_elements.head())

    # Count the most frequent three-character compound endings
    most_common_compound_endings = compounds.synonyms.str[-3:].value_counts().reset_index()
    most_common_compound_endings.columns = ['compound_ending', 'frequency']
    # print(set(most_common_compound_endings.compound_ending))

    # We set the minimum number of occurrences to 10
    most_common_compound_endings = most_common_compound_endings[most_common_compound_endings['frequency'] >= 10]
    # print(most_common_compound_endings.head(30))

    return most_common_compound_elements, most_common_compound_endings
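A minimal usage sketch of the function above, assuming wiki_compounds_lc.csv exists with the synonyms column used in the code (the print calls are only for quick inspection and are not part of the committed files):

# Quick check of the two frequency tables returned by get_wiki_compounds().
from create_wiki_compounds import get_wiki_compounds

elements, endings = get_wiki_compounds()
# elements has columns ['compound_element', 'frequency'];
# endings has columns ['compound_ending', 'frequency'], filtered to frequency >= 10.
print(elements.head())
print(endings.head())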
import nltk
import pandas as pd
import string
import re
from pathlib import Path
from tqdm import tqdm
from collections import Counter
from nltk.tokenize import word_tokenize

from create_wiki_compounds import get_wiki_compounds

nltk.download('punkt')


class Document:
    text = []
    output_file_type: str = ".txt"
    output_file_path: str

    def __init__(self, file_path: str, most_common_elements: list, most_common_compound_endings: list):
        self.mcce = most_common_compound_endings
        self.mcelements = most_common_elements
        self.elements = []
        with open(file_path) as f:
            self.text = f.read()
        # Tokenize the raw text into words
        self.text = word_tokenize(self.text)

    def check_for_compounds_endings(self):
        # Collect every token that ends with one of the common compound endings
        for word in self.text:
            if word.endswith(tuple(self.mcce)):
                self.elements.append(word)
        return self.elements

    def check_for_compounds_elements(self):
        # Collect every token that matches a common compound element
        for word in self.text:
            if word in self.mcelements:
                self.elements.append(word)
        return self.elements
CORPUS_FOLDER = Path("./corpus")

if __name__ == '__main__':
    most_common_elements, most_common_compound_endings = get_wiki_compounds()
    # Remove elements that are only one letter long
    most_common_elements = most_common_elements[most_common_elements['compound_element'].apply(lambda x: len(x) > 1)]

    df_ = pd.DataFrame(columns=['element'])  # , 'abbrev', 'files_found', 'count', 'compounds_cooccurence'])
    for txt_path in tqdm(CORPUS_FOLDER.iterdir()):
        if txt_path.name.endswith(".txt"):
            try:
                x = Document(str(txt_path), list(most_common_elements.compound_element), list(most_common_compound_endings.compound_ending))
                # Check which listed elements occur in the text
                elements = x.check_for_compounds_elements()
                elements_counts = Counter(elements)
                # print(pd.Series(elements).value_counts())
                # Create a dataframe with element and per-file frequency from the Counter
                df = pd.DataFrame.from_dict(elements_counts, orient='index').reset_index()
                df = df.rename(columns={"index": "element", 0: "count_per_file"})
                df['file'] = txt_path.name
                df_ = pd.concat([df_, df])
                result = df_.groupby('element', as_index=False).agg(list)
                # endings = x.check_for_compounds_endings()
            except Exception as e:
                print(e)

    # Calculate the total count of each element across all files
    result['total_count'] = result['count_per_file'].apply(sum)
    result = result.sort_values('total_count')
    print(result)
    result.to_csv("compounds_found.csv", index=False)
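A short sketch for inspecting the output, assuming the script above has already been run from the repository root and produced compounds_found.csv (note that the per-file lists are written out as strings by to_csv):

# Inspect the most frequent compounds found across the corpus.
import pandas as pd

found = pd.read_csv("compounds_found.csv")
# Columns: element, count_per_file (list of per-file counts, stored as a string),
# file (list of file names, stored as a string), total_count.
print(found.sort_values("total_count", ascending=False).head(10))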