Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F121629820
cooc.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Jul 12, 15:27
Size
1 KB
Mime Type
text/x-python
Expires
Mon, Jul 14, 15:27 (2 d)
Engine
blob
Format
Raw Data
Handle
27362598
Attached To
rTZUCT ML_Project1
cooc.py
View Options
#!/usr/bin/env python3
from
scipy.sparse
import
*
import
numpy
as
np
import
pickle
def main(vocab_path='vocab.pkl',
         train_paths=('pre_train_pos_full.txt', 'pre_train_neg_full.txt'),
         out_path='cooc.pkl'):
    """Build a token co-occurrence matrix from text files and pickle it.

    The vocabulary is unpickled from ``vocab_path`` and used as a mapping
    from token string to integer index. Every line of every file in
    ``train_paths`` is split on whitespace; tokens missing from the
    vocabulary (or mapped to a negative index) are dropped. For each line,
    every ordered pair ``(t, t2)`` of the remaining token indices adds 1 to
    entry ``[t, t2]``, including the pairing of each token occurrence with
    itself. The summed sparse COO matrix is pickled to ``out_path``.

    The matrix is always square with side ``len(vocab)`` (or larger, if a
    vocabulary index exceeds that), so its shape does not depend on which
    tokens happen to occur in the corpus, and a corpus without any
    in-vocabulary token yields an all-zero matrix instead of failing
    scipy's shape inference.

    Args:
        vocab_path: Path of the pickled token -> index mapping.
        train_paths: Iterable of text file paths to scan, in order.
        out_path: Path the pickled ``coo_matrix`` is written to.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # this at a vocabulary file you created yourself.
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab)

    row, col = [], []
    counter = 1  # 1-based count of lines read across all input files
    for fn in train_paths:
        with open(fn) as f:
            for line in f:
                looked_up = (vocab.get(t, -1) for t in line.split())
                tokens = [t for t in looked_up if t >= 0]
                # Emit the full cross product tokens x tokens, in the same
                # order a nested "for t / for t2" loop would produce.
                for t in tokens:
                    row.extend([t] * len(tokens))
                    col.extend(tokens)

                if counter % 10000 == 0:
                    print(counter)
                counter += 1

    # One unit count per recorded pair; duplicates are merged below.
    data = np.ones(len(row), dtype=int)

    # Fix the shape explicitly. len(vocab) is the expected side length
    # (presumably indices are 0..len(vocab)-1 -- verify against the script
    # that builds the vocabulary); the max() guards against larger indices.
    size = max(vocab_size, max(row, default=-1) + 1)
    cooc = coo_matrix((data, (row, col)), shape=(size, size))
    print("summing duplicates (this can take a while)")
    cooc.sum_duplicates()
    with open(out_path, 'wb') as f:
        pickle.dump(cooc, f, pickle.HIGHEST_PROTOCOL)
# Run the co-occurrence build only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Event Timeline
Log In to Comment