Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F102741002
cooc.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Feb 23, 17:14
Size
1 KB
Mime Type
text/x-python
Expires
Tue, Feb 25, 17:14 (2 d)
Engine
blob
Format
Raw Data
Handle
24417048
Attached To
rTZUCT ML_Project1
cooc.py
View Options
#!/usr/bin/env python3
from
scipy.sparse
import
*
import
numpy
as
np
import
pickle
def main():
    """Build a word co-occurrence matrix and pickle it to ``cooc.pkl``.

    Reads the vocabulary mapping (token string -> integer id) from
    ``vocab.pkl``, then scans ``pre_train_pos_full.txt`` and
    ``pre_train_neg_full.txt`` line by line.  For every line, each ordered
    pair of in-vocabulary tokens (including a token paired with itself)
    contributes a count of 1 to the matrix entry ``[id_a, id_b]``.

    The result is a ``scipy.sparse.coo_matrix`` with duplicates summed,
    written to ``cooc.pkl`` using the highest pickle protocol.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only run this against a vocab.pkl that you generated yourself.
    with open('vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab)
    # Square dimension that covers every vocabulary id.  Normally this is
    # just vocab_size; the max() guards against non-contiguous ids so that
    # no index that previously worked can fall outside the explicit shape.
    dim = max(vocab_size, max(vocab.values(), default=-1) + 1)

    data, row, col = [], [], []
    counter = 1
    for fn in ['pre_train_pos_full.txt', 'pre_train_neg_full.txt']:
        with open(fn) as f:
            for line in f:
                # Map tokens to ids; unknown tokens map to -1 and are dropped.
                tokens = [vocab.get(t, -1) for t in line.strip().split()]
                tokens = [t for t in tokens if t >= 0]
                n = len(tokens)
                # Emit every ordered pair (t, t2) of tokens on this line, in
                # the same order a nested loop over `tokens` would.
                for t in tokens:
                    data.extend([1] * n)
                    row.extend([t] * n)
                    col.extend(tokens)

                # Progress indicator: one message per 10000 lines read.
                if counter % 10000 == 0:
                    print(counter)
                counter += 1

    # Pass the shape explicitly: without it the dimensions are inferred from
    # the largest index seen, so the matrix would be too small whenever the
    # highest-id words never occur, and construction would fail outright if
    # no in-vocabulary token was found at all.
    cooc = coo_matrix((data, (row, col)), shape=(dim, dim))
    print("summing duplicates (this can take a while)")
    cooc.sum_duplicates()
    with open('cooc.pkl', 'wb') as f:
        pickle.dump(cooc, f, pickle.HIGHEST_PROTOCOL)
# Run the co-occurrence build only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Event Timeline
Log In to Comment