Skip to content
Snippets Groups Projects
Commit d770fea0 authored by Marie Kl's avatar Marie Kl
Browse files

#20

Created co-occurrence matrix - file based per found element in corpus
parent cb3614d8
No related branches found
No related tags found
No related merge requests found
Pipeline #20097 passed
......@@ -9,6 +9,7 @@ from tqdm import tqdm
from collections import Counter
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import csv
class Document:
......@@ -67,7 +68,19 @@ if __name__ == '__main__':
# Total number of occurrences of each element: sum its per-file counts.
result['total_count'] = result['count_per_file'].apply(sum)
# Order the rows by total count (sort_values sorts ascending by default).
result = result.sort_values('total_count')
# NOTE(review): the print below and its commented-out copy look like the
# removed/added sides of the diff this text was taken from -- confirm which
# one is current.
print(result)
#print(result)
# Persist the table of found elements, without the DataFrame index column.
result.to_csv("compounds_found.csv", index=False)
# Build a binary element-by-file co-occurrence matrix and write it to CSV.
# Only the `element` and `file` columns are used below, so both count
# columns are dropped in a single call.
co_occ = result.drop(['count_per_file', 'total_count'], axis=1)
# Map each element to the collection stored in its `file` entry.
# NOTE(review): assumes each `file` entry is an iterable of hashable file
# identifiers -- confirm against the code that builds `result`.
co_occ_dict = dict(zip(co_occ.element, co_occ.file))
# Matrix columns: every distinct file seen for any element, in sorted order.
values = sorted({e for v in co_occ_dict.values() for e in v})
# One row per element: 1 where the element was found in that file, else 0.
co_occ_matrix = {}
for k, v in co_occ_dict.items():
    # A set gives constant-time membership tests across all columns.
    present = set(v)
    co_occ_matrix[k] = [1 if value in present else 0 for value in values]
# newline='' is what the csv module requires so that no blank rows appear
# on platforms that translate line endings.
with open('co_occurence.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # The leading header cell labels the element column, keeping the file
    # names aligned with their 0/1 columns in the data rows.
    writer.writerow(['element', *values])
    for k, v in co_occ_matrix.items():
        # Unpack the row so each 0/1 flag lands in its own cell.
        writer.writerow([k, *v])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment