Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: davidschwab/material-science-word-embeddings
Commits on Source (2)
Source diff could not be displayed: it is too large.
@@ -85,7 +85,6 @@ if __name__ == '__main__':
    elements_counts = Counter(elements)
    formulas_counts = Counter(formulas)
    regex_formulas_counts = Counter(regex_formulas)
    #print(pd.Series(elements).value_counts())
    # create df with element and frequency from Counter
    #df = pd.DataFrame.from_dict(elements_counts, orient='index').reset_index()
    #df = df.rename(columns={"index": "element", 0: "count_per_file"})
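
A minimal sketch (not from the repository) of the per-file frequency table that the commented-out df lines above describe, using a made-up token list in place of the parsed corpus:

from collections import Counter
import pandas as pd

# hypothetical token list; in the script these come from parsing the corpus files
elements = ["Fe", "O", "Fe", "Si", "O", "O"]
elements_counts = Counter(elements)

# turn the Counter into a two-column frequency table, as the commented-out lines suggest
df = pd.DataFrame.from_dict(elements_counts, orient='index').reset_index()
df = df.rename(columns={"index": "element", 0: "count_per_file"})
print(df.sort_values("count_per_file", ascending=False))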
@@ -109,7 +108,6 @@ if __name__ == '__main__':
        except Exception as e:
            print(e)
    # calculate total count of element
    # TODO: divide to separate df for element and formula
    result.to_csv("elements_result.csv", index=False)
    result2.to_csv("formulas_result.csv", index=False)
    result3.to_csv("regex_formulas_result.csv", index=False)
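
One possible reading (a sketch, not from the repository) of the "# calculate total count of element" comment, assuming result holds one row per file and element with the count_per_file column used earlier; the data and output file name below are placeholders:

import pandas as pd

# toy stand-in for `result`: one row per (file, element) pair
result = pd.DataFrame({
    "element": ["Fe", "O", "Fe", "Si"],
    "count_per_file": [3, 5, 2, 1],
})

# aggregate to a corpus-wide total per element before writing the CSV
totals = (
    result.groupby("element", as_index=False)["count_per_file"]
          .sum()
          .rename(columns={"count_per_file": "total_count"})
)
totals.to_csv("elements_totals_demo.csv", index=False)
print(totals)  # Fe 5, O 5, Si 1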
@@ -136,14 +134,62 @@ if __name__ == '__main__':
    out = {}
    # keep the keys in sorted order
    sorted_keys = sorted(co_occ_matrix)
    # now for each key in the list
    for i in range(len(sorted_keys) - 1):
        out[sorted_keys[i]] = []
        for j in range(len(sorted_keys) - 1):
            co_occ = sum([a * b for a, b in zip(co_occ_matrix[sorted_keys[i]], co_occ_matrix[sorted_keys[j]])])
            if co_occ > 0:
                out[sorted_keys[i]].append((sorted_keys[j], co_occ))
    print(out)
                out[sorted_keys[i]].append([sorted_keys[j]])
    def min_distance(text, w1, w2):
        index1 = None
        index2 = None
        distance = 1000000
        for idx, word in enumerate(text[0].split(" ")):
            if word == w1:
                if index2 is not None:
                    distance = min(distance, abs(idx - index2) - 1)
                index1 = idx
            if word == w2:
                if index1 is not None:
                    distance = min(distance, abs(idx - index1) - 1)
                index2 = idx
        if index1 is not None and index2 is not None:
            return distance
        return -1
    for k, v in out.items():
        for i in range(len(v)):
            if set(co_occ_dict[k]).intersection(co_occ_dict[v[i][0]]):
                iterator = iter(set(co_occ_dict[k]).intersection(co_occ_dict[v[i][0]]))
                v[i].append([])
                for r in range(len(set(co_occ_dict[k]).intersection(co_occ_dict[v[i][0]]))):
                    matching_file = next(iterator, None)
                    with open('./corpus/' + str(matching_file)) as f:
                        text = f.readlines()
                    min_dist = min_distance(text, k, v[i][0])
                    v[i][1].append(min_dist)
    for k, v in out.items():
        to_remove = []
        for i in range(len(v)):
            if v[i][1][0] == -1:
                to_remove.append(i)
        for index in sorted(to_remove, reverse=True):
            del v[index]
    def Convert(a):
        it = iter(a)
        dct = dict(zip(it, it))
        return dct
    for k, v in out.items():
        for i in range(len(v)):
            #v[i] = dict.fromkeys(v[i], v[i][1])
            v[i][1] = sorted(v[i][1])
            v[i] = Convert(v[i])
    with open('elements_co_occ.csv', 'w') as f:  # You will need 'wb' mode in Python 2.x
        w = csv.DictWriter(f, out.keys())
        w.writeheader()
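
The inner loop in this hunk scores a pair of keys by the dot product of their rows in co_occ_matrix. A self-contained toy version of that computation (not from the repository, and assuming each row is a per-document indicator vector) looks like this:

# each key maps to a per-document vector: 1 if the element occurs in that document (assumed structure)
co_occ_matrix = {
    "Fe": [1, 0, 1, 1],
    "O":  [1, 1, 0, 1],
    "Si": [0, 1, 0, 0],
}

# the dot product counts documents in which both elements occur
co_occ = sum(a * b for a, b in zip(co_occ_matrix["Fe"], co_occ_matrix["O"]))
print(co_occ)  # -> 2 (documents 0 and 3 contain both Fe and O)

On the same reading, min_distance above returns the smallest number of tokens strictly between the two words in the first line of a matching file: for a toy line "the alloy contains Fe and O", min_distance(["the alloy contains Fe and O"], "Fe", "O") would return 1, and -1 whenever one of the words never appears.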
@@ -153,8 +199,6 @@ if __name__ == '__main__':
        json.dump(out, f)
    result = json.dumps(out)
    print(result)
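
A minimal, self-contained sketch (not from the repository) of the serialization step at the end of the script, using a toy out of the shape the new code builds (element -> list of {co-occurring element: sorted distances} dicts); the file names below are placeholders:

import csv
import json

# toy result of the co-occurrence / distance pipeline
out = {
    "Fe": [{"O": [1, 4]}, {"Si": [2]}],
    "O":  [{"Fe": [1, 4]}],
}

# one CSV row whose columns are the element keys, mirroring csv.DictWriter(f, out.keys())
with open('elements_co_occ_demo.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, out.keys())
    w.writeheader()
    w.writerow(out)

# the same structure as JSON, mirroring json.dump(out, f)
with open('elements_co_occ_demo.json', 'w') as f:
    json.dump(out, f)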