Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
M
material-science-word-embeddings
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
David Schwab
material-science-word-embeddings
Compare revisions
d5648410a01f5955a642b14d0fca01f42eee9c86 to f37a246d83a24bfde125524d3c6a6c5be0f258c6
Compare revisions
Changes are shown as if the
source
revision was being merged into the
target
revision.
Learn more about comparing revisions.
Source
davidschwab/material-science-word-embeddings
Select target project
No results found
f37a246d83a24bfde125524d3c6a6c5be0f258c6
Select Git revision
Branches
main
Swap
Target
davidschwab/material-science-word-embeddings
Select target project
davidschwab/material-science-word-embeddings
1 result
d5648410a01f5955a642b14d0fca01f42eee9c86
Select Git revision
Branches
main
Show changes
Only incoming changes from source
Include changes to target since source was created
Compare
Commits on Source (2)
#20
· c4b489f0
Marie Kl
authored
2 years ago
json of cooccurences including distances of elements
c4b489f0
Merge remote-tracking branch 'origin/main'
· f37a246d
Marie Kl
authored
2 years ago
f37a246d
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
co_occurences.json
+1
-1
1 addition, 1 deletion
co_occurences.json
get_compounds.py
+51
-7
51 additions, 7 deletions
get_compounds.py
with
52 additions
and
8 deletions
co_occurences.json
View file @
f37a246d
Source diff could not be displayed: it is too large. Options to address this:
view the blob
.
This diff is collapsed.
Click to expand it.
get_compounds.py
View file @
f37a246d
...
...
@@ -85,7 +85,6 @@ if __name__ == '__main__':
elements_counts
=
Counter
(
elements
)
formulas_counts
=
Counter
(
formulas
)
regex_formulas_counts
=
Counter
(
regex_formulas
)
#print(pd.Series(elements).value_counts())
# create df with element and frequency from Counter
#df = pd.DataFrame.from_dict(elements_counts, orient='index').reset_index()
#df = df.rename(columns={"index": "element", 0: "count_per_file"})
...
...
@@ -109,7 +108,6 @@ if __name__ == '__main__':
except
Exception
as
e
:
print
(
e
)
# calculate total count of element
# TODO: split into separate DataFrames for elements and formulas
result
.
to_csv
(
"
elements_result.csv
"
,
index
=
False
)
result2
.
to_csv
(
"
formulas_result.csv
"
,
index
=
False
)
result3
.
to_csv
(
"
regex_formulas_result.csv
"
,
index
=
False
)
...
...
@@ -136,14 +134,62 @@ if __name__ == '__main__':
out
=
{}
# keep the keys in sorted order
sorted_keys
=
sorted
(
co_occ_matrix
)
# now for each key in the list
for
i
in
range
(
len
(
sorted_keys
)
-
1
):
out
[
sorted_keys
[
i
]]
=
[]
for
j
in
range
(
len
(
sorted_keys
)
-
1
):
co_occ
=
sum
([
a
*
b
for
a
,
b
in
zip
(
co_occ_matrix
[
sorted_keys
[
i
]],
co_occ_matrix
[
sorted_keys
[
j
]])])
if
co_occ
>
0
:
out
[
sorted_keys
[
i
]].
append
((
sorted_keys
[
j
],
co_occ
))
print
(
out
)
out
[
sorted_keys
[
i
]].
append
([
sorted_keys
[
j
]])
def min_distance(text, w1, w2):
    """Return the smallest number of tokens separating ``w1`` and ``w2``.

    Only the first entry of ``text`` is scanned. That line is split on single
    spaces and every occurrence of one word is compared with the most recent
    occurrence of the other, so the result is the minimum over all pairings.

    Args:
        text: Sequence of lines (for example the result of
            ``file.readlines()``). Only ``text[0]`` is read.
        w1: First token to look for; compared with ``==`` against each token.
        w2: Second token to look for.

    Returns:
        The number of tokens strictly between the closest pair (``0`` means
        the words are adjacent), or ``-1`` when ``text`` is empty or either
        word does not occur. When ``w1 == w2`` and the word occurs, the
        result is also ``-1`` because a token is compared with itself.
    """
    # An empty file yields an empty list; treat it like "words not found"
    # instead of failing on text[0].
    if not text:
        return -1

    # Lines coming from readlines() keep their terminator, which would stay
    # glued to the last token and prevent it from ever matching.
    tokens = text[0].rstrip("\r\n").split(" ")

    last_w1 = None  # index of the most recent occurrence of w1
    last_w2 = None  # index of the most recent occurrence of w2
    best = None     # smallest gap seen so far; None until both words were seen

    for idx, word in enumerate(tokens):
        if word == w1:
            if last_w2 is not None:
                gap = idx - last_w2 - 1
                best = gap if best is None else min(best, gap)
            last_w1 = idx
        # Deliberately not ``elif``: when w1 == w2 both branches must run so
        # the self-comparison keeps producing -1 as before.
        if word == w2:
            if last_w1 is not None:
                gap = idx - last_w1 - 1
                best = gap if best is None else min(best, gap)
            last_w2 = idx

    # ``best`` is set exactly when both words were found at least once.
    return -1 if best is None else best
for
k
,
v
in
out
.
items
():
for
i
in
range
(
len
(
v
)):
if
set
(
co_occ_dict
[
k
]).
intersection
(
co_occ_dict
[
v
[
i
][
0
]]):
iterator
=
iter
(
set
(
co_occ_dict
[
k
]).
intersection
(
co_occ_dict
[
v
[
i
][
0
]]))
v
[
i
].
append
([])
for
r
in
range
(
len
(
set
(
co_occ_dict
[
k
]).
intersection
(
co_occ_dict
[
v
[
i
][
0
]]))):
matching_file
=
next
(
iterator
,
None
)
with
open
(
'
./corpus/
'
+
str
(
matching_file
))
as
f
:
text
=
f
.
readlines
()
min_dist
=
min_distance
(
text
,
k
,
v
[
i
][
0
])
v
[
i
][
1
].
append
(
min_dist
)
for
k
,
v
in
out
.
items
():
to_remove
=
[]
for
i
in
range
(
len
(
v
)):
if
v
[
i
][
1
][
0
]
==
-
1
:
to_remove
.
append
(
i
)
for
index
in
sorted
(
to_remove
,
reverse
=
True
):
del
v
[
index
]
def Convert(a):
    """Build a dict from a flat iterable of alternating keys and values.

    Consecutive items are paired as ``(key, value)``: the 1st item maps to
    the 2nd, the 3rd to the 4th, and so on. A trailing item without a
    partner is dropped. Later duplicates of a key overwrite earlier ones.
    """
    # Both names handed to zip() refer to the same iterator, so each step
    # consumes two consecutive items from ``a``.
    items = iter(a)
    return {key: value for key, value in zip(items, items)}
for
k
,
v
in
out
.
items
():
for
i
in
range
(
len
(
v
)):
#v[i] = dict.fromkeys(v[i], v[i][1])
v
[
i
][
1
]
=
sorted
(
v
[
i
][
1
])
v
[
i
]
=
Convert
(
v
[
i
])
with
open
(
'
elements_co_occ.csv
'
,
'
w
'
)
as
f
:
# You will need 'wb' mode in Python 2.x
w
=
csv
.
DictWriter
(
f
,
out
.
keys
())
w
.
writeheader
()
...
...
@@ -153,8 +199,6 @@ if __name__ == '__main__':
json
.
dump
(
out
,
f
)
result
=
json
.
dumps
(
out
)
print
(
result
)
This diff is collapsed.
Click to expand it.