Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: davidschwab/material-science-word-embeddings
Commits on Source (2)
Source diff could not be displayed: it is too large.
@@ -85,7 +85,6 @@ if __name__ == '__main__':
    elements_counts = Counter(elements)
    formulas_counts = Counter(formulas)
    regex_formulas_counts = Counter(regex_formulas)
    #print(pd.Series(elements).value_counts())
    # create df with element and frequency from Counter
    #df = pd.DataFrame.from_dict(elements_counts, orient='index').reset_index()
    #df = df.rename(columns={"index": "element", 0: "count_per_file"})
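
A minimal sketch (not from the repository) of the per-file frequency table that the commented-out df lines above describe, using a made-up token list in place of the parsed corpus:

from collections import Counter
import pandas as pd

# hypothetical token list; in the script these come from parsing the corpus files
elements = ["Fe", "O", "Fe", "Si", "O", "O"]
elements_counts = Counter(elements)

# turn the Counter into a two-column frequency table, as the commented-out lines suggest
df = pd.DataFrame.from_dict(elements_counts, orient='index').reset_index()
df = df.rename(columns={"index": "element", 0: "count_per_file"})
print(df.sort_values("count_per_file", ascending=False))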
@@ -109,7 +108,6 @@ if __name__ == '__main__':
        except Exception as e:
            print(e)
    # calculate total count of element
    # TODO: divide to separate df for element and formula
    result.to_csv("elements_result.csv", index=False)
    result2.to_csv("formulas_result.csv", index=False)
    result3.to_csv("regex_formulas_result.csv", index=False)
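
One possible reading (a sketch, not from the repository) of the "# calculate total count of element" comment, assuming result holds one row per file and element with the count_per_file column used earlier; the data and output file name below are placeholders:

import pandas as pd

# toy stand-in for `result`: one row per (file, element) pair
result = pd.DataFrame({
    "element": ["Fe", "O", "Fe", "Si"],
    "count_per_file": [3, 5, 2, 1],
})

# aggregate to a corpus-wide total per element before writing the CSV
totals = (
    result.groupby("element", as_index=False)["count_per_file"]
          .sum()
          .rename(columns={"count_per_file": "total_count"})
)
totals.to_csv("elements_totals_demo.csv", index=False)
print(totals)  # Fe 5, O 5, Si 1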
@@ -136,14 +134,62 @@ if __name__ == '__main__':
    out = {}
    # keep the keys in sorted order
    sorted_keys = sorted(co_occ_matrix)
    # now for each key in the list
    for i in range(len(sorted_keys) - 1):
        out[sorted_keys[i]] = []
        for j in range(len(sorted_keys) - 1):
            co_occ = sum([a * b for a, b in zip(co_occ_matrix[sorted_keys[i]], co_occ_matrix[sorted_keys[j]])])
            if co_occ > 0:
                out[sorted_keys[i]].append((sorted_keys[j], co_occ))
    print(out)
                out[sorted_keys[i]].append([sorted_keys[j]])
    def min_distance(text, w1, w2):
        index1 = None
        index2 = None
        distance = 1000000
        for idx, word in enumerate(text[0].split(" ")):
            if word == w1:
                if index2 is not None:
                    distance = min(distance, abs(idx - index2) - 1)
                index1 = idx
            if word == w2:
                if index1 is not None:
                    distance = min(distance, abs(idx - index1) - 1)
                index2 = idx
        if index1 is not None and index2 is not None:
            return distance
        return -1
    for k, v in out.items():
        for i in range(len(v)):
            if set(co_occ_dict[k]).intersection(co_occ_dict[v[i][0]]):
                iterator = iter(set(co_occ_dict[k]).intersection(co_occ_dict[v[i][0]]))
                v[i].append([])
                for r in range(len(set(co_occ_dict[k]).intersection(co_occ_dict[v[i][0]]))):
                    matching_file = next(iterator, None)
                    with open('./corpus/' + str(matching_file)) as f:
                        text = f.readlines()
                    min_dist = min_distance(text, k, v[i][0])
                    v[i][1].append(min_dist)
    for k, v in out.items():
        to_remove = []
        for i in range(len(v)):
            if v[i][1][0] == -1:
                to_remove.append(i)
        for index in sorted(to_remove, reverse=True):
            del v[index]
    def Convert(a):
        it = iter(a)
        dct = dict(zip(it, it))
        return dct
    for k, v in out.items():
        for i in range(len(v)):
            #v[i] = dict.fromkeys(v[i], v[i][1])
            v[i][1] = sorted(v[i][1])
            v[i] = Convert(v[i])
    with open('elements_co_occ.csv', 'w') as f:  # You will need 'wb' mode in Python 2.x
        w = csv.DictWriter(f, out.keys())
        w.writeheader()
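
The inner loop in this hunk scores a pair of keys by the dot product of their rows in co_occ_matrix. A self-contained toy version of that computation (not from the repository, and assuming each row is a per-document indicator vector) looks like this:

# each key maps to a per-document vector: 1 if the element occurs in that document (assumed structure)
co_occ_matrix = {
    "Fe": [1, 0, 1, 1],
    "O":  [1, 1, 0, 1],
    "Si": [0, 1, 0, 0],
}

# the dot product counts documents in which both elements occur
co_occ = sum(a * b for a, b in zip(co_occ_matrix["Fe"], co_occ_matrix["O"]))
print(co_occ)  # -> 2 (documents 0 and 3 contain both Fe and O)

On the same reading, min_distance above returns the smallest number of tokens strictly between the two words in the first line of a matching file: for a toy line "the alloy contains Fe and O", min_distance(["the alloy contains Fe and O"], "Fe", "O") would return 1, and -1 whenever one of the words never appears.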
@@ -153,8 +199,6 @@ if __name__ == '__main__':
        json.dump(out, f)
    result = json.dumps(out)
    print(result)
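
A minimal, self-contained sketch (not from the repository) of the serialization step at the end of the script, using a toy out of the shape the new code builds (element -> list of {co-occurring element: sorted distances} dicts); the file names below are placeholders:

import csv
import json

# toy result of the co-occurrence / distance pipeline
out = {
    "Fe": [{"O": [1, 4]}, {"Si": [2]}],
    "O":  [{"Fe": [1, 4]}],
}

# one CSV row whose columns are the element keys, mirroring csv.DictWriter(f, out.keys())
with open('elements_co_occ_demo.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, out.keys())
    w.writeheader()
    w.writerow(out)

# the same structure as JSON, mirroring json.dump(out, f)
with open('elements_co_occ_demo.json', 'w') as f:
    json.dump(out, f)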