Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F90752482
bibclassify_keyword_analyzer.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Nov 4, 10:53
Size
9 KB
Mime Type
text/x-python
Expires
Wed, Nov 6, 10:53 (2 d)
Engine
blob
Format
Raw Data
Handle
22129569
Attached To
R3600 invenio-infoscience
bibclassify_keyword_analyzer.py
View Options
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2008, 2009, 2010, 2011, CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BibClassify keyword analyser.
This module contains methods to extract keywords from texts. It provides 3
different methods for 3 different types of keywords: single keywords, composite
keywords and author keywords.
"""
import
re
import
sys
import
time
try
:
from
bibclassify_config
import
CFG_BIBCLASSIFY_VALID_SEPARATORS
,
\
CFG_BIBCLASSIFY_AUTHOR_KW_START
,
\
CFG_BIBCLASSIFY_AUTHOR_KW_END
,
\
CFG_BIBCLASSIFY_AUTHOR_KW_SEPARATION
from
bibclassify_utils
import
write_message
except
ImportError
,
err
:
print
>>
sys
.
stderr
,
"Error:
%s
"
%
err
sys
.
exit
(
1
)
# Retrieve the custom configuration if it exists.
try
:
from
bibclassify_config_local
import
*
except
ImportError
:
# No local configuration was found.
pass
_MAXIMUM_SEPARATOR_LENGTH
=
max
([
len
(
_separator
)
for
_separator
in
CFG_BIBCLASSIFY_VALID_SEPARATORS
])
def
get_single_keywords
(
skw_db
,
fulltext
,
verbose
=
True
):
"""Returns a dictionary of single keywords bound with the positions
of the matches in the fulltext.
Format of the output dictionary is (single keyword: positions)."""
timer_start
=
time
.
clock
()
# Matched span -> single keyword
records
=
[]
for
single_keyword
in
skw_db
:
for
regex
in
single_keyword
.
regex
:
for
match
in
regex
.
finditer
(
fulltext
):
# Modify the right index to put it on the last letter
# of the word.
span
=
(
match
.
span
()[
0
],
match
.
span
()[
1
]
-
1
)
# Remove the previous records contained by this span
records
=
[
record
for
record
in
records
if
not
_contains_span
(
span
,
record
[
0
])]
add
=
True
for
previous_record
in
records
:
if
((
span
,
single_keyword
)
==
previous_record
or
_contains_span
(
previous_record
[
0
],
span
)):
# Match is contained by a previous match.
add
=
False
break
if
add
:
records
.
append
((
span
,
single_keyword
))
# List of single_keywords: {spans: single keyword}
single_keywords
=
{}
for
span
,
single_keyword
in
records
:
single_keywords
.
setdefault
(
single_keyword
,
[])
.
append
(
span
)
if
verbose
:
write_message
(
"INFO: Matching single keywords...
%d
keywords found "
"in
%.1f
sec."
%
(
len
(
single_keywords
),
time
.
clock
()
-
timer_start
),
stream
=
sys
.
stderr
,
verbose
=
3
)
return
single_keywords
def
get_composite_keywords
(
ckw_db
,
fulltext
,
skw_spans
,
verbose
=
True
):
"""Returns a list of composite keywords bound with the number of
occurrences found in the text string.
Format of the output list is (composite keyword, count, component counts)."""
timer_start
=
time
.
clock
()
# Build the list of composite candidates
ckw_list
=
[]
skw_as_components
=
[]
for
composite_keyword
in
ckw_db
:
# Counters for the composite keyword. First count is for the
# number of occurrences in the whole document and second count
# is for the human defined keywords.
ckw_count
=
0
matched_spans
=
[]
# Check the alternative labels.
for
regex
in
composite_keyword
.
regex
:
for
match
in
regex
.
finditer
(
fulltext
):
span
=
list
(
match
.
span
())
span
[
1
]
-=
1
span
=
tuple
(
span
)
if
not
span
in
matched_spans
:
ckw_count
+=
1
matched_spans
.
append
(
span
)
# Get the single keywords locations.
try
:
components
=
composite_keyword
.
compositeof
except
AttributeError
:
print
>>
sys
.
stderr
,
(
"Cached ontology is corrupted. Please "
"remove the cached ontology in your temporary file."
)
sys
.
exit
(
1
)
try
:
spans
=
[
skw_spans
[
component
]
for
component
in
components
]
except
KeyError
:
# The keyword components are not to be found in the text.
# This is not a dramatic exception and we can safely ignore
# it.
pass
else
:
ckw_spans
=
[]
for
index
in
range
(
len
(
spans
)
-
1
):
if
ckw_spans
:
previous_spans
=
ckw_spans
else
:
previous_spans
=
spans
[
index
]
ckw_spans
=
[]
for
new_span
in
[(
span0
,
span1
)
for
span0
in
previous_spans
for
span1
in
spans
[
index
+
1
]]:
span
=
_get_ckw_span
(
fulltext
,
new_span
)
if
span
is
not
None
:
ckw_spans
.
append
(
span
)
for
span
in
[
span
for
span
in
ckw_spans
if
not
span
in
matched_spans
]:
ckw_count
+=
1
matched_spans
.
append
(
span
)
if
ckw_count
:
# Gather the component counts.
component_counts
=
[]
for
component
in
components
:
skw_as_components
.
append
(
component
)
# Get the single keyword count.
try
:
component_counts
.
append
(
len
(
skw_spans
[
component
]))
except
KeyError
:
component_counts
.
append
(
0
)
# Store the composite keyword
ckw_list
.
append
((
composite_keyword
,
ckw_count
,
component_counts
))
# Remove the single keywords that appear as components from the list
# of single keywords.
for
skw
in
skw_as_components
:
try
:
del
skw_spans
[
skw
]
except
KeyError
:
pass
if
verbose
:
write_message
(
"INFO: Matching composite keywords...
%d
keywords found "
"in
%.1f
sec."
%
(
len
(
ckw_list
),
time
.
clock
()
-
timer_start
),
stream
=
sys
.
stderr
,
verbose
=
3
)
return
ckw_list
def
get_author_keywords
(
skw_db
,
ckw_db
,
fulltext
):
"""Finds out human defined keyowrds in a text string. Searches for
the string "Keywords:" and its declinations and matches the
following words."""
timer_start
=
time
.
clock
()
split_string
=
CFG_BIBCLASSIFY_AUTHOR_KW_START
.
split
(
fulltext
,
1
)
if
len
(
split_string
)
==
1
:
write_message
(
"INFO: Matching author keywords... No keywords found."
,
stream
=
sys
.
stderr
,
verbose
=
3
)
return
None
kw_string
=
split_string
[
1
]
for
regex
in
CFG_BIBCLASSIFY_AUTHOR_KW_END
:
parts
=
regex
.
split
(
kw_string
,
1
)
kw_string
=
parts
[
0
]
# We separate the keywords.
author_keywords
=
CFG_BIBCLASSIFY_AUTHOR_KW_SEPARATION
.
split
(
kw_string
)
write_message
(
"INFO: Matching author keywords...
%d
keywords found in "
"
%.1f
sec."
%
(
len
(
author_keywords
),
time
.
clock
()
-
timer_start
),
stream
=
sys
.
stderr
,
verbose
=
3
)
out
=
{}
for
kw
in
author_keywords
:
# If the author keyword is an acronym with capital letters
# separated by points, remove the points.
if
re
.
match
(
'([A-Z].)+$'
,
kw
):
kw
=
kw
.
replace
(
'.'
,
''
)
# First try with the keyword as such, then lower it.
kw_with_spaces
=
'
%s
'
%
kw
matching_skw
=
get_single_keywords
(
skw_db
,
kw_with_spaces
,
verbose
=
False
)
matching_ckw
=
get_composite_keywords
(
ckw_db
,
kw_with_spaces
,
matching_skw
,
verbose
=
False
)
if
matching_skw
or
matching_ckw
:
out
[
kw
]
=
(
matching_skw
,
matching_ckw
)
continue
lowkw
=
kw
.
lower
()
matching_skw
=
get_single_keywords
(
skw_db
,
'
%s
'
%
lowkw
,
verbose
=
False
)
matching_ckw
=
get_composite_keywords
(
ckw_db
,
'
%s
'
%
lowkw
,
matching_skw
,
verbose
=
False
)
out
[
kw
]
=
(
matching_skw
,
matching_ckw
)
return
out
def
_get_ckw_span
(
fulltext
,
spans
):
"""Returns the span of the composite keyword if it is valid. Returns
None otherwise."""
if
spans
[
0
]
<
spans
[
1
]:
words
=
(
spans
[
0
],
spans
[
1
])
dist
=
spans
[
1
][
0
]
-
spans
[
0
][
1
]
else
:
words
=
(
spans
[
1
],
spans
[
0
])
dist
=
spans
[
0
][
0
]
-
spans
[
1
][
1
]
if
dist
==
0
:
# Two keywords are adjacent. We have a match.
return
(
min
(
words
[
0
]
+
words
[
1
]),
max
(
words
[
0
]
+
words
[
1
]))
elif
dist
<=
_MAXIMUM_SEPARATOR_LENGTH
:
separator
=
fulltext
[
words
[
0
][
1
]:
words
[
1
][
0
]
+
1
]
# Check the separator.
if
separator
.
strip
()
in
CFG_BIBCLASSIFY_VALID_SEPARATORS
:
return
(
min
(
words
[
0
]
+
words
[
1
]),
max
(
words
[
0
]
+
words
[
1
]))
# There is no inclusion.
return
None
def
_contains_span
(
span0
,
span1
):
"""Return true if span0 contains span1, False otherwise."""
if
(
span0
==
span1
or
span0
[
0
]
>
span1
[
0
]
or
span0
[
1
]
<
span1
[
1
]):
return
False
return
True
Event Timeline
Log In to Comment