Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91122283
combine-entity-results.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Nov 8, 03:36
Size
1 KB
Mime Type
text/x-python
Expires
Sun, Nov 10, 03:36 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
22200847
Attached To
R1473 warcbase
combine-entity-results.py
View Options
#!/usr/bin/env python
import
os
,
sys
import
fnmatch
import
gzip
import
re
# Glob used to select the gzipped part files inside the input directory.
part_pattern = 'part-*.gz'

# One entity record looks like:
#   {PERSON=[a, b], ORGANIZATION=[c], LOCATION=[d, e]}
# The pattern is a *bytes* regex because gzip streams yield bytes; working in
# bytes end to end copies entity names verbatim, whatever their encoding, and
# behaves the same on Python 2 and Python 3. It is compiled once here instead
# of being rebuilt for every input line.
_ENTITY_RE = re.compile(
    br'{PERSON=\[(.*)\], ORGANIZATION='
    br'\[(.*)\], LOCATION=\[(.*)\]}'
)


def combine(input_dir, output_dir):
    """Merge the entity lists of all part files into three flat text files.

    Every file in ``input_dir`` whose name matches ``part_pattern`` is read as
    gzip data, line by line. For each line containing an entity record, the
    PERSON, ORGANIZATION and LOCATION lists are appended to ``pers.txt``,
    ``org.txt`` and ``loc.txt`` in ``output_dir``, one entity per output line
    (the ``', '`` separator is replaced by a newline). Empty lists contribute
    nothing. Lines without an entity record are reported on stdout and
    skipped.

    Part files are processed in sorted name order so that the output is
    deterministic. The three output files are truncated on every call.

    I/O failures (``IOError``) are reported on stdout rather than raised;
    all files opened so far are closed in that case as well.

    Args:
        input_dir: directory containing the gzipped part files.
        output_dir: existing directory that receives the three output files.

    Returns:
        None.
    """
    # os.listdir() returns names in arbitrary order; sort for stable output.
    part_file_names = sorted(
        fnmatch.filter(os.listdir(input_dir), part_pattern))

    try:
        # 'with' guarantees every handle is closed even when an error occurs
        # part-way through. Binary mode matches the bytes read from gzip.
        with open(os.path.join(output_dir, 'pers.txt'), mode='wb') as pers_f, \
                open(os.path.join(output_dir, 'org.txt'), mode='wb') as org_f, \
                open(os.path.join(output_dir, 'loc.txt'), mode='wb') as loc_f:
            # Output file paired with the regex group that feeds it.
            targets = ((pers_f, 1), (org_f, 2), (loc_f, 3))

            for p in part_file_names:
                part_path = os.path.join(input_dir, p)
                with gzip.open(part_path, mode='rb') as part_f:
                    # Stream the file instead of loading it with readlines().
                    for line in part_f:
                        m = _ENTITY_RE.search(line)
                        if not m:
                            print("Error: Expected matching entity string")
                            continue
                        for (f, i) in targets:
                            entities = m.group(i)
                            # An empty list ("[]") yields an empty group.
                            if entities:
                                f.write(
                                    entities.replace(b', ', b'\n') + b'\n')
    except IOError as e:
        # strerror is None for errors that carry only a message (for example
        # a file that is not valid gzip), so fall back to the exception text.
        print("Operation failed: %s" % (e.strerror or e))
    return
if __name__ == '__main__':
    # Command line: <input dir> [output dir]
    # When no output directory is given, results are written next to the
    # part files in the input directory.
    if len(sys.argv) >= 2:
        input_dir = sys.argv[1]
        # Use '>= 3' rather than '== 3': with an exact comparison, any extra
        # trailing argument caused the explicit output directory to be
        # ignored and the results to land in the input directory instead.
        output_dir = sys.argv[2] if len(sys.argv) >= 3 else input_dir

        # Both directories must already exist; nothing is created here.
        if os.path.isdir(input_dir) and os.path.isdir(output_dir):
            combine(input_dir, output_dir)
        else:
            print("Invalid directory name(s)")
    else:
        # print(...) with a single parenthesized argument behaves the same
        # under Python 2 and Python 3.
        print("Usage: %s <dir containing %s files> [output dir]"
              % (sys.argv[0], part_pattern))
Event Timeline
Log In to Comment