Commit cb3614d8 authored by Marie Kl

#20

Created a common compound list from the wiki and converted it to lowercase.
Lists common compounds in the corpus.
Counts occurrences of each compound per file and in total.
parent 7909f0fe
import pandas as pd


def get_wiki_compounds():
    # Load the lowercased compound list extracted from the wiki
    compounds = pd.read_csv('./compounds/wiki_compounds_lc.csv')
    # print(compounds.head())

    # Count how often each element occurs across the compound synonyms
    most_common_compound_elements = compounds.synonyms.str.split(expand=True).stack().value_counts().reset_index()
    most_common_compound_elements.columns = ['compound_element', 'frequency']
    # print(most_common_compound_elements.head())

    # Count the most frequent three-character compound endings
    most_common_compound_endings = compounds.synonyms.str[-3:].value_counts().reset_index()
    most_common_compound_endings.columns = ['compound_ending', 'frequency']
    # print(set(most_common_compound_endings.compound_ending))

    # We set the minimum number of occurrences to 10
    most_common_compound_endings = most_common_compound_endings[most_common_compound_endings['frequency'] >= 10]
    # print(most_common_compound_endings.head(30))

    return most_common_compound_elements, most_common_compound_endings
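A minimal usage sketch of the function above, assuming wiki_compounds_lc.csv exists with the synonyms column used in the code (the print calls are only for quick inspection and are not part of the committed files):

# Quick check of the two frequency tables returned by get_wiki_compounds().
from create_wiki_compounds import get_wiki_compounds

elements, endings = get_wiki_compounds()
# elements has columns ['compound_element', 'frequency'];
# endings has columns ['compound_ending', 'frequency'], filtered to frequency >= 10.
print(elements.head())
print(endings.head())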
import nltk
import pandas as pd
import string
import re
from pathlib import Path
from tqdm import tqdm
from collections import Counter
from nltk.tokenize import word_tokenize

from create_wiki_compounds import get_wiki_compounds

nltk.download('punkt')


class Document:
    text = []
    output_file_type: str = ".txt"
    output_file_path: str

    def __init__(self, file_path: str, most_common_elements: list, most_common_compound_endings: list):
        self.mcce = most_common_compound_endings
        self.mcelements = most_common_elements
        self.elements = []
        with open(file_path) as f:
            self.text = f.read()
        # Tokenize the raw text into words
        self.text = word_tokenize(self.text)

    def check_for_compounds_endings(self):
        # Collect every token that ends with one of the common compound endings
        for word in self.text:
            if word.endswith(tuple(self.mcce)):
                self.elements.append(word)
        return self.elements

    def check_for_compounds_elements(self):
        # Collect every token that matches a common compound element
        for word in self.text:
            if word in self.mcelements:
                self.elements.append(word)
        return self.elements
CORPUS_FOLDER = Path("./corpus")

if __name__ == '__main__':
    most_common_elements, most_common_compound_endings = get_wiki_compounds()
    # Remove elements that are only one letter long
    most_common_elements = most_common_elements[most_common_elements['compound_element'].apply(lambda x: len(x) > 1)]

    df_ = pd.DataFrame(columns=['element'])  # , 'abbrev', 'files_found', 'count', 'compounds_cooccurence'])
    for txt_path in tqdm(CORPUS_FOLDER.iterdir()):
        if txt_path.name.endswith(".txt"):
            try:
                x = Document(str(txt_path), list(most_common_elements.compound_element), list(most_common_compound_endings.compound_ending))
                # Check which listed elements occur in the text
                elements = x.check_for_compounds_elements()
                elements_counts = Counter(elements)
                # print(pd.Series(elements).value_counts())
                # Create a dataframe with element and per-file frequency from the Counter
                df = pd.DataFrame.from_dict(elements_counts, orient='index').reset_index()
                df = df.rename(columns={"index": "element", 0: "count_per_file"})
                df['file'] = txt_path.name
                df_ = pd.concat([df_, df])
                result = df_.groupby('element', as_index=False).agg(list)
                # endings = x.check_for_compounds_endings()
            except Exception as e:
                print(e)

    # Calculate the total count of each element across all files
    result['total_count'] = result['count_per_file'].apply(sum)
    result = result.sort_values('total_count')
    print(result)
    result.to_csv("compounds_found.csv", index=False)
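A short sketch for inspecting the output, assuming the script above has already been run from the repository root and produced compounds_found.csv (note that the per-file lists are written out as strings by to_csv):

# Inspect the most frequent compounds found across the corpus.
import pandas as pd

found = pd.read_csv("compounds_found.csv")
# Columns: element, count_per_file (list of per-file counts, stored as a string),
# file (list of file names, stored as a string), total_count.
print(found.sort_values("total_count", ascending=False).head(10))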