Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F81699793
engine.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Sep 7, 12:16
Size
29 KB
Mime Type
text/x-python
Expires
Mon, Sep 9, 12:16 (1 d, 21 h)
Engine
blob
Format
Raw Data
Handle
20607718
Attached To
R3600 invenio-infoscience
engine.py
View Options
## This file is part of Invenio.
## Copyright (C) 2007, 2008, 2009, 2010, 2011, 2013, 2014 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BibClassify engine.
This module is the main module of BibClassify. its two main methods are
output_keywords_for_sources and get_keywords_from_text. The first one output
keywords for a list of sources (local files or URLs, PDF or text) while the
second one outputs the keywords for text lines (which are obtained using the
module bibclassify_text_normalizer).
This module also takes care of the different outputs (text, MARCXML or HTML).
But unfortunately there is a confusion between running in a standalone mode
and producing output suitable for printing, and running in a web-based
mode where the webtemplate is used. For the moment the pieces of the representation
code are left in this module.
This module is STANDALONE safe
"""
import
os
import
time
import
config
as
bconfig
log
=
bconfig
.
get_logger
(
"bibclassify.engine"
)
from
invenio.legacy.bibclassify
import
ontology_reader
as
reader
import
text_extractor
as
extractor
import
text_normalizer
as
normalizer
import
keyword_analyzer
as
keyworder
import
acronym_analyzer
as
acronymer
try
:
from
invenio.utils.url
import
make_user_agent_string
except
ImportError
:
## Not in Invenio, we simply use default agent
def
make_user_agent_string
(
component
=
None
):
return
bconfig
.
CFG_BIBCLASSIFY_USER_AGENT
try
:
from
invenio.utils.text
import
encode_for_xml
except
ImportError
:
## Not in Invenio, we use a simple workaround
encode_for_xml
=
lambda
text
:
text
.
replace
(
'&'
,
'&'
)
.
replace
(
'<'
,
'<'
)
# ---------------------------------------------------------------------
# API
# ---------------------------------------------------------------------
def
output_keywords_for_sources
(
input_sources
,
taxonomy_name
,
output_mode
=
"text"
,
output_limit
=
bconfig
.
CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER
,
spires
=
False
,
match_mode
=
"full"
,
no_cache
=
False
,
with_author_keywords
=
False
,
rebuild_cache
=
False
,
only_core_tags
=
False
,
extract_acronyms
=
False
,
api
=
False
,
**
kwargs
):
"""Outputs the keywords for each source in sources."""
# Inner function which does the job and it would be too much work to
# refactor the call (and it must be outside the loop, before it did
# not process multiple files)
def
process_lines
():
if
output_mode
==
"text"
:
print
"Input file:
%s
"
%
source
output
=
get_keywords_from_text
(
text_lines
,
taxonomy_name
,
output_mode
=
output_mode
,
output_limit
=
output_limit
,
spires
=
spires
,
match_mode
=
match_mode
,
no_cache
=
no_cache
,
with_author_keywords
=
with_author_keywords
,
rebuild_cache
=
rebuild_cache
,
only_core_tags
=
only_core_tags
,
extract_acronyms
=
extract_acronyms
)
if
api
:
return
output
else
:
if
isinstance
(
output
,
dict
):
for
i
in
output
:
print
output
[
i
]
# Get the fulltext for each source.
for
entry
in
input_sources
:
log
.
info
(
"Trying to read input file
%s
."
%
entry
)
text_lines
=
None
source
=
""
if
os
.
path
.
isdir
(
entry
):
for
filename
in
os
.
listdir
(
entry
):
filename
=
os
.
path
.
join
(
entry
,
filename
)
if
os
.
path
.
isfile
(
filename
):
text_lines
=
extractor
.
text_lines_from_local_file
(
filename
)
if
text_lines
:
source
=
filename
process_lines
()
elif
os
.
path
.
isfile
(
entry
):
text_lines
=
extractor
.
text_lines_from_local_file
(
entry
)
if
text_lines
:
source
=
os
.
path
.
basename
(
entry
)
process_lines
()
else
:
# Treat as a URL.
text_lines
=
extractor
.
text_lines_from_url
(
entry
,
user_agent
=
make_user_agent_string
(
"BibClassify"
))
if
text_lines
:
source
=
entry
.
split
(
"/"
)[
-
1
]
process_lines
()
def
get_keywords_from_local_file
(
local_file
,
taxonomy_name
,
output_mode
=
"text"
,
output_limit
=
bconfig
.
CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER
,
spires
=
False
,
match_mode
=
"full"
,
no_cache
=
False
,
with_author_keywords
=
False
,
rebuild_cache
=
False
,
only_core_tags
=
False
,
extract_acronyms
=
False
,
api
=
False
,
**
kwargs
):
"""Outputs keywords reading a local file. Arguments and output are the same
as for @see: get_keywords_from_text() """
log
.
info
(
"Analyzing keywords for local file
%s
."
%
local_file
)
text_lines
=
extractor
.
text_lines_from_local_file
(
local_file
)
return
get_keywords_from_text
(
text_lines
,
taxonomy_name
,
output_mode
=
output_mode
,
output_limit
=
output_limit
,
spires
=
spires
,
match_mode
=
match_mode
,
no_cache
=
no_cache
,
with_author_keywords
=
with_author_keywords
,
rebuild_cache
=
rebuild_cache
,
only_core_tags
=
only_core_tags
,
extract_acronyms
=
extract_acronyms
)
def
get_keywords_from_text
(
text_lines
,
taxonomy_name
,
output_mode
=
"text"
,
output_limit
=
bconfig
.
CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER
,
spires
=
False
,
match_mode
=
"full"
,
no_cache
=
False
,
with_author_keywords
=
False
,
rebuild_cache
=
False
,
only_core_tags
=
False
,
extract_acronyms
=
False
,
**
kwargs
):
"""Extracts keywords from the list of strings
@var text_lines: list of strings (will be normalized before being
joined into one string)
@keyword taxonomy_name: string, name of the taxonomy_name
@keyword output_mode: string - text|html|marcxml|raw
@keyword output_limit: int
@keyword spires: boolean, if True marcxml output reflect spires
codes
@keyword match_mode: str - partial|full; in partial mode only
beginning of the fulltext is searched
@keyword no_cache: boolean, means loaded definitions will not be saved
@keyword with_author_keywords: boolean, extract keywords from the
pdfs
@keyword rebuild_cache: boolean
@keyword only_core_tags: boolean
@return: if output_mode=raw, it will return
(single_keywords, composite_keywords, author_keywords, acronyms)
for other output modes it returns formatted string
"""
cache
=
reader
.
get_cache
(
taxonomy_name
)
if
not
cache
:
reader
.
set_cache
(
taxonomy_name
,
reader
.
get_regular_expressions
(
taxonomy_name
,
rebuild
=
rebuild_cache
,
no_cache
=
no_cache
))
cache
=
reader
.
get_cache
(
taxonomy_name
)
_skw
=
cache
[
0
]
_ckw
=
cache
[
1
]
text_lines
=
normalizer
.
cut_references
(
text_lines
)
fulltext
=
normalizer
.
normalize_fulltext
(
"
\n
"
.
join
(
text_lines
))
if
match_mode
==
"partial"
:
fulltext
=
_get_partial_text
(
fulltext
)
author_keywords
=
None
if
with_author_keywords
:
author_keywords
=
extract_author_keywords
(
_skw
,
_ckw
,
fulltext
)
acronyms
=
{}
if
extract_acronyms
:
acronyms
=
extract_abbreviations
(
fulltext
)
single_keywords
=
extract_single_keywords
(
_skw
,
fulltext
)
composite_keywords
=
extract_composite_keywords
(
_ckw
,
fulltext
,
single_keywords
)
if
only_core_tags
:
single_keywords
=
clean_before_output
(
_filter_core_keywors
(
single_keywords
))
composite_keywords
=
_filter_core_keywors
(
composite_keywords
)
else
:
# Filter out the "nonstandalone" keywords
single_keywords
=
clean_before_output
(
single_keywords
)
return
get_keywords_output
(
single_keywords
,
composite_keywords
,
taxonomy_name
,
author_keywords
,
acronyms
,
output_mode
,
output_limit
,
spires
,
only_core_tags
)
def
extract_single_keywords
(
skw_db
,
fulltext
):
"""Find single keywords in the fulltext
@var skw_db: list of KeywordToken objects
@var fulltext: string, which will be searched
@return : dictionary of matches in a format {
<keyword object>, [[position, position...], ],
..
}
or empty {}
"""
return
keyworder
.
get_single_keywords
(
skw_db
,
fulltext
)
or
{}
def
extract_composite_keywords
(
ckw_db
,
fulltext
,
skw_spans
):
"""Returns a list of composite keywords bound with the number of
occurrences found in the text string.
@var ckw_db: list of KewordToken objects (they are supposed to be composite ones)
@var fulltext: string to search in
@skw_spans: dictionary of already identified single keywords
@return : dictionary of matches in a format {
<keyword object>, [[position, position...], [info_about_matches] ],
..
}
or empty {}
"""
return
keyworder
.
get_composite_keywords
(
ckw_db
,
fulltext
,
skw_spans
)
or
{}
def
extract_abbreviations
(
fulltext
):
"""Extract acronyms from the fulltext
@var fulltext: utf-8 string
@return: dictionary of matches in a formt {
<keyword object>, [matched skw or ckw object, ....]
}
or empty {}
"""
acronyms
=
{}
K
=
reader
.
KeywordToken
for
k
,
v
in
acronymer
.
get_acronyms
(
fulltext
)
.
items
():
acronyms
[
K
(
k
,
type
=
'acronym'
)]
=
v
return
acronyms
def
extract_author_keywords
(
skw_db
,
ckw_db
,
fulltext
):
"""Finds out human defined keyowrds in a text string. Searches for
the string "Keywords:" and its declinations and matches the
following words.
@var skw_db: list single kw object
@var ckw_db: list of composite kw objects
@var fulltext: utf-8 string
@return: dictionary of matches in a formt {
<keyword object>, [matched skw or ckw object, ....]
}
or empty {}
"""
akw
=
{}
K
=
reader
.
KeywordToken
for
k
,
v
in
keyworder
.
get_author_keywords
(
skw_db
,
ckw_db
,
fulltext
)
.
items
():
akw
[
K
(
k
,
type
=
'author-kw'
)]
=
v
return
akw
# ---------------------------------------------------------------------
# presentation functions
# ---------------------------------------------------------------------
def
get_keywords_output
(
single_keywords
,
composite_keywords
,
taxonomy_name
,
author_keywords
=
None
,
acronyms
=
None
,
style
=
"text"
,
output_limit
=
0
,
spires
=
False
,
only_core_tags
=
False
):
"""Returns a formatted string representing the keywords according
to the chosen style. This is the main routing call, this function will
also strip unwanted keywords before output and limits the number
of returned keywords
@var single_keywords: list of single keywords
@var composite_keywords: list of composite keywords
@var taxonomy_name: string, taxonomy name
@keyword author_keywords: dictionary of author keywords extracted from fulltext
@keyword acronyms: dictionary of extracted acronyms
@keyword style: text|html|marc
@keyword output_limit: int, number of maximum keywords printed (it applies
to single and composite keywords separately)
@keyword spires: boolen meaning spires output style
@keyword only_core_tags: boolean
"""
# sort the keywords, but don't limit them (that will be done later)
single_keywords_p
=
_sort_kw_matches
(
single_keywords
)
composite_keywords_p
=
_sort_kw_matches
(
composite_keywords
)
try
:
p_html
=
style
.
index
(
"html"
)
p_text
=
style
.
index
(
"text"
)
if
p_text
>
p_html
:
style
[
p_html
]
=
"text"
style
[
p_text
]
=
"html"
except
:
#stupide except pass because index return an exception if
#the element is not present in the list
pass
my_styles
=
{}
for
s
in
style
:
if
s
==
"text"
:
my_styles
[
"text"
]
=
_output_text
(
single_keywords_p
,
composite_keywords_p
,
author_keywords
,
acronyms
,
spires
,
only_core_tags
,
limit
=
output_limit
)
print
my_styles
[
"text"
]
elif
s
==
"marcxml"
:
my_styles
[
"marcxml"
]
=
_output_marc
(
single_keywords_p
,
composite_keywords_p
,
author_keywords
,
acronyms
)
elif
s
==
"html"
:
if
"text"
in
my_styles
:
my_styles
[
"html"
]
=
_output_html
(
single_keywords_p
,
composite_keywords_p
,
author_keywords
,
acronyms
,
spires
,
taxonomy_name
,
limit
=
output_limit
,
output
=
my_styles
[
"text"
])
else
:
test_output
=
_output_text
(
single_keywords_p
,
composite_keywords_p
,
author_keywords
,
acronyms
,
spires
,
only_core_tags
,
limit
=
output_limit
)
my_styles
[
"html"
]
=
_output_html
(
single_keywords
,
composite_keywords
,
author_keywords
,
acronyms
,
spires
,
taxonomy_name
,
limit
=
output_limit
,
output
=
test_output
)
elif
s
==
"raw"
:
if
output_limit
>
0
:
my_styles
[
"raw"
]
=
(
_kw
(
_sort_kw_matches
(
single_keywords
,
output_limit
)),
_kw
(
_sort_kw_matches
(
composite_keywords
,
output_limit
)),
author_keywords
,
# this we don't limit (?)
_kw
(
_sort_kw_matches
(
acronyms
,
output_limit
)))
else
:
my_styles
[
"raw"
]
=
(
single_keywords_p
,
composite_keywords_p
,
author_keywords
,
acronyms
)
elif
s
==
"dict"
:
my_styles
[
"dict"
]
=
_output_dict
(
single_keywords_p
,
composite_keywords_p
,
author_keywords
,
acronyms
)
return
my_styles
def
build_marc
(
recid
,
single_keywords
,
composite_keywords
,
spires
=
False
,
author_keywords
=
None
,
acronyms
=
None
):
"""Creates xml record
@recid: ingeter
@var single_keywords: dictionary of kws
@var composite_keywords: dictionary of kws
@keyword spires: please don't use, left for historical
reasons
@keyword author_keywords: dictionary of extracted keywords
@keyword acronyms: dictionary of extracted acronyms
@return: str, marxml
"""
output
=
[
'<collection><record>
\n
'
'<controlfield tag="001">
%s
</controlfield>'
%
recid
]
# no need to sort
single_keywords
=
single_keywords
.
items
()
composite_keywords
=
composite_keywords
.
items
()
output
.
append
(
_output_marc
(
single_keywords
,
composite_keywords
,
author_keywords
,
acronyms
))
output
.
append
(
'</record></collection>'
)
return
'
\n
'
.
join
(
output
)
def
_output_marc
(
skw_matches
,
ckw_matches
,
author_keywords
,
acronyms
,
spires
=
False
,
kw_field
=
bconfig
.
CFG_MAIN_FIELD
,
auth_field
=
bconfig
.
CFG_AUTH_FIELD
,
acro_field
=
bconfig
.
CFG_ACRON_FIELD
,
provenience
=
'BibClassify'
):
"""Outputs the keywords in the MARCXML format.
@var skw_matches: list of single keywords
@var ckw_matches: list of composite keywords
@var author_keywords: dictionary of extracted author keywords
@var acronyms: dictionary of acronyms
@var spires: boolean, True=generate spires output - BUT NOTE: it is
here only not to break compatibility, in fact spires output
should never be used for xml because if we read marc back
into the KeywordToken objects, we would not find them
@keyword provenience: string that identifies source (authority) that
assigned the contents of the field
@return: string, formatted MARC"""
kw_template
=
(
'<datafield tag="
%s
" ind1="
%s
" ind2="
%s
">
\n
'
' <subfield code="2">
%s
</subfield>
\n
'
' <subfield code="a">
%s
</subfield>
\n
'
' <subfield code="n">
%s
</subfield>
\n
'
' <subfield code="9">
%s
</subfield>
\n
'
'</datafield>
\n
'
)
output
=
[]
tag
,
ind1
,
ind2
=
_parse_marc_code
(
kw_field
)
for
keywords
in
(
skw_matches
,
ckw_matches
):
if
keywords
and
len
(
keywords
):
for
kw
,
info
in
keywords
:
output
.
append
(
kw_template
%
(
tag
,
ind1
,
ind2
,
encode_for_xml
(
provenience
),
encode_for_xml
(
kw
.
output
(
spires
)),
len
(
info
[
0
]),
encode_for_xml
(
kw
.
getType
())))
for
field
,
keywords
in
((
auth_field
,
author_keywords
),
(
acro_field
,
acronyms
)):
if
keywords
and
len
(
keywords
)
and
field
:
# field='' we shall not save the keywords
tag
,
ind1
,
ind2
=
_parse_marc_code
(
field
)
for
kw
,
info
in
keywords
.
items
():
output
.
append
(
kw_template
%
(
tag
,
ind1
,
ind2
,
encode_for_xml
(
provenience
),
encode_for_xml
(
kw
),
''
,
encode_for_xml
(
kw
.
getType
())))
return
""
.
join
(
output
)
def
_output_dict
(
skw_matches
,
ckw_matches
,
author_keywords
,
acronyms
,
auth_field
=
bconfig
.
CFG_AUTH_FIELD
,
acro_field
=
bconfig
.
CFG_ACRON_FIELD
):
"""Outputs the keywords in the MARCXML format.
@var skw_matches: list of single keywords
@var ckw_matches: list of composite keywords
@var author_keywords: dictionary of extracted author keywords
@var acronyms: dictionary of acronyms
@return: dict, dict of keywords"""
output
=
{}
for
keywords
in
(
skw_matches
,
ckw_matches
):
if
keywords
and
len
(
keywords
):
for
kw
,
info
in
keywords
:
output
[
str
(
kw
)]
=
len
(
info
[
0
])
for
field
,
keywords
in
((
auth_field
,
author_keywords
),
(
acro_field
,
acronyms
)):
if
keywords
and
len
(
keywords
)
and
field
:
# field='' we shall not save the keywords
for
kw
,
info
in
keywords
.
items
():
output
[
str
(
kw
)]
=
len
(
info
)
return
output
def
_output_text
(
skw_matches
=
None
,
ckw_matches
=
None
,
author_keywords
=
None
,
acronyms
=
None
,
spires
=
False
,
only_core_tags
=
False
,
limit
=
bconfig
.
CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER
):
"""Outputs the results obtained in text format.
@var skw_matches: sorted list of single keywords
@var ckw_matches: sorted list of composite keywords
@var author_keywords: dictionary of author keywords
@var acronyms: dictionary of acronyms
@var spires: boolean
@var only_core_tags: boolean
@keyword limit: int, number of printed keywords
@return: str, html formatted output
"""
output
=
[]
if
limit
:
resized_skw
=
skw_matches
[
0
:
limit
]
resized_ckw
=
ckw_matches
[
0
:
limit
]
else
:
resized_skw
=
skw_matches
resized_ckw
=
ckw_matches
if
only_core_tags
:
output
.
append
(
'
\n
Core keywords:
\n
'
+
'
\n
'
.
join
(
_get_core_keywords
(
skw_matches
,
ckw_matches
,
spires
=
spires
)
or
[
'--'
]))
else
:
output
.
append
(
'
\n
Author keywords:
\n
'
+
'
\n
'
.
join
(
_get_author_keywords
(
author_keywords
,
spires
=
spires
)
or
[
'--'
]))
output
.
append
(
'
\n
Composite keywords:
\n
'
+
'
\n
'
.
join
(
_get_compositekws
(
resized_ckw
,
spires
=
spires
)
or
[
'--'
]))
output
.
append
(
'
\n
Single keywords:
\n
'
+
'
\n
'
.
join
(
_get_singlekws
(
resized_skw
,
spires
=
spires
)
or
[
'--'
]))
output
.
append
(
'
\n
Core keywords:
\n
'
+
'
\n
'
.
join
(
_get_core_keywords
(
skw_matches
,
ckw_matches
,
spires
=
spires
)
or
[
'--'
]))
output
.
append
(
'
\n
Field codes:
\n
'
+
'
\n
'
.
join
(
_get_fieldcodes
(
resized_skw
,
resized_ckw
,
spires
=
spires
)
or
[
'--'
]))
output
.
append
(
'
\n
Acronyms:
\n
'
+
'
\n
'
.
join
(
_get_acronyms
(
acronyms
)
or
[
'--'
]))
output
.
append
(
'
\n
--
\n
%s
'
%
_signature
())
return
"
\n
"
.
join
(
output
)
+
"
\n
"
def
_output_html
(
skw_matches
=
None
,
ckw_matches
=
None
,
author_keywords
=
None
,
acronyms
=
None
,
spires
=
False
,
only_core_tags
=
False
,
limit
=
bconfig
.
CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER
,
output
=
""
):
"""Output the same as txt output does, but HTML formatted
@var skw_matches: sorted list of single keywords
@var ckw_matches: sorted list of composite keywords
@var author_keywords: dictionary of extracted author keywords
@var acronyms: dictionary of acronyms
@var spires: boolean
@var only_core_tags: boolean
@keyword limit: int, number of printed keywords
@return: str, html formatted output
"""
output
=
output
.
replace
(
'
\n
'
,
'<br/>'
)
return
"""<html>
<head>
<title>Automatically generated keywords by bibclassify</title>
</head>
<body>
%s
</body>
</html>"""
%
output
def
_get_singlekws
(
skw_matches
,
spires
=
False
):
"""
@var skw_matches: dict of {keyword: [info,...]}
@keyword spires: bool, to get the spires output
@return: list of formatted keywords
"""
output
=
[]
for
single_keyword
,
info
in
skw_matches
:
output
.
append
(
"
%d
%s
"
%
(
len
(
info
[
0
]),
single_keyword
.
output
(
spires
)))
return
output
def
_get_compositekws
(
ckw_matches
,
spires
=
False
):
"""
@var ckw_matches: dict of {keyword: [info,...]}
@keyword spires: bool, to get the spires output
@return: list of formatted keywords
"""
output
=
[]
for
composite_keyword
,
info
in
ckw_matches
:
output
.
append
(
"
%d
%s
%s
"
%
(
len
(
info
[
0
]),
composite_keyword
.
output
(
spires
),
info
[
1
]))
return
output
def
_get_acronyms
(
acronyms
):
"""Returns a formatted list of acronyms."""
acronyms_str
=
[]
if
acronyms
:
for
acronym
,
expansions
in
acronyms
.
iteritems
():
expansions_str
=
", "
.
join
([
"
%s
(
%d
)"
%
expansion
for
expansion
in
expansions
])
acronyms_str
.
append
(
"
%s
%s
"
%
(
acronym
,
expansions_str
))
return
sorted
(
acronyms_str
)
def
_get_author_keywords
(
author_keywords
,
spires
=
False
):
"""Formats the output for the author keywords.
@return: list of formatted author keywors
"""
out
=
[]
if
author_keywords
:
for
keyword
,
matches
in
author_keywords
.
items
():
skw_matches
=
matches
[
0
]
#dictionary of single keywords
ckw_matches
=
matches
[
1
]
#dict of composite keywords
matches_str
=
[]
for
ckw
,
spans
in
ckw_matches
.
items
():
matches_str
.
append
(
'"
%s
"'
%
ckw
.
output
(
spires
))
for
skw
,
spans
in
skw_matches
.
items
():
matches_str
.
append
(
'"
%s
"'
%
skw
.
output
(
spires
))
if
matches_str
:
out
.
append
(
'"
%s
" matches
%s
'
%
(
keyword
,
", "
.
join
(
matches_str
)))
else
:
out
.
append
(
'"
%s
" matches no keyword.'
%
keyword
)
return
sorted
(
out
)
def
_get_fieldcodes
(
skw_matches
,
ckw_matches
,
spires
=
False
):
"""Returns the output for the field codes.
@var skw_matches: dict of {keyword: [info,...]}
@var ckw_matches: dict of {keyword: [info,...]}
@keyword spires: bool, to get the spires output
@return: string"""
fieldcodes
=
{}
output
=
[]
for
skw
,
_
in
skw_matches
:
for
fieldcode
in
skw
.
fieldcodes
:
fieldcodes
.
setdefault
(
fieldcode
,
set
())
.
add
(
skw
.
output
(
spires
))
for
ckw
,
_
in
ckw_matches
:
if
len
(
ckw
.
fieldcodes
):
for
fieldcode
in
ckw
.
fieldcodes
:
fieldcodes
.
setdefault
(
fieldcode
,
set
())
.
add
(
ckw
.
output
(
spires
))
else
:
#inherit field-codes from the composites
for
kw
in
ckw
.
getComponents
():
for
fieldcode
in
kw
.
fieldcodes
:
fieldcodes
.
setdefault
(
fieldcode
,
set
())
.
add
(
'
%s
*'
%
ckw
.
output
(
spires
))
fieldcodes
.
setdefault
(
'*'
,
set
())
.
add
(
kw
.
output
(
spires
))
for
fieldcode
,
keywords
in
fieldcodes
.
items
():
output
.
append
(
'
%s
:
%s
'
%
(
fieldcode
,
', '
.
join
(
keywords
)))
return
sorted
(
output
)
def
_get_core_keywords
(
skw_matches
,
ckw_matches
,
spires
=
False
):
"""Returns the output for the field codes.
@var skw_matches: dict of {keyword: [info,...]}
@var ckw_matches: dict of {keyword: [info,...]}
@keyword spires: bool, to get the spires output
@return: set of formatted core keywords
"""
output
=
set
()
def
_get_value_kw
(
kw
):
'''Inner function to help to sort the Core keywords'''
i
=
0
while
kw
[
i
]
.
isdigit
():
i
+=
1
if
i
>
0
:
return
int
(
kw
[:
i
])
else
:
return
0
for
skw
,
info
in
skw_matches
:
if
skw
.
core
:
output
.
add
(
'
%d
%s
'
%
(
len
(
info
[
0
]),
skw
.
output
(
spires
)))
for
ckw
,
info
in
ckw_matches
:
if
ckw
.
core
:
output
.
add
(
'
%d
%s
'
%
(
len
(
info
[
0
]),
ckw
.
output
(
spires
)))
else
:
#test if one of the components is not core
i
=
0
for
c
in
ckw
.
getComponents
():
if
c
.
core
:
output
.
add
(
'-
%s
(
%s
)'
%
(
c
.
output
(
spires
),
info
[
1
][
i
]))
i
+=
1
return
sorted
(
output
,
key
=
_get_value_kw
,
reverse
=
True
)
def
_filter_core_keywors
(
keywords
):
matches
=
{}
for
kw
,
info
in
keywords
.
items
():
if
kw
.
core
:
matches
[
kw
]
=
info
return
matches
def
_signature
():
"""Prints out the bibclassify signature
@todo: add information about taxonomy, rdflib"""
return
'bibclassify v
%s
'
%
(
bconfig
.
VERSION
,)
def
clean_before_output
(
kw_matches
):
"""Returns a clean copy of the keywords data structure -
ie. stripped off the standalone and other unwanted elements"""
filtered_kw_matches
=
{}
for
kw_match
,
info
in
kw_matches
.
iteritems
():
if
not
kw_match
.
nostandalone
:
filtered_kw_matches
[
kw_match
]
=
info
return
filtered_kw_matches
# ---------------------------------------------------------------------
# helper functions
# ---------------------------------------------------------------------
def
_skw_matches_comparator
(
kw0
,
kw1
):
"""
Compares 2 single keywords objects - first by the number of their
spans (ie. how many times they were found), if it is equal
it compares them by lenghts of their labels.
"""
list_comparison
=
cmp
(
len
(
kw1
[
1
][
0
]),
len
(
kw0
[
1
][
0
]))
if
list_comparison
:
return
list_comparison
if
kw0
[
0
]
.
isComposite
()
and
kw1
[
0
]
.
isComposite
():
component_avg0
=
sum
(
kw0
[
1
][
1
])
/
len
(
kw0
[
1
][
1
])
component_avg1
=
sum
(
kw1
[
1
][
1
])
/
len
(
kw1
[
1
][
1
])
component_comparison
=
cmp
(
component_avg1
,
component_avg0
)
if
component_comparison
:
return
component_comparison
return
cmp
(
len
(
str
(
kw1
[
0
])),
len
(
str
(
kw0
[
0
])))
def
_kw
(
keywords
):
"""Turns list of keywords into dictionary"""
r
=
{}
for
k
,
v
in
keywords
:
r
[
k
]
=
v
return
r
def
_sort_kw_matches
(
skw_matches
,
limit
=
0
):
"""Returns a resized version of data structures of keywords to the
given length."""
sorted_keywords
=
list
(
skw_matches
.
items
())
sorted_keywords
.
sort
(
_skw_matches_comparator
)
return
limit
and
sorted_keywords
[:
limit
]
or
sorted_keywords
def
_get_partial_text
(
fulltext
):
"""Returns a shortened version of the fulltext used with the partial
matching mode. The version is composed of 20% in the beginning and
20% in the middle of the text."""
length
=
len
(
fulltext
)
get_index
=
lambda
x
:
int
(
float
(
x
)
/
100
*
length
)
partial_text
=
[
fulltext
[
get_index
(
start
):
get_index
(
end
)]
for
start
,
end
in
bconfig
.
CFG_BIBCLASSIFY_PARTIAL_TEXT
]
return
"
\n
"
.
join
(
partial_text
)
def
save_keywords
(
filename
,
xml
):
tmp_dir
=
os
.
path
.
dirname
(
filename
)
if
not
os
.
path
.
isdir
(
tmp_dir
):
os
.
mkdir
(
tmp_dir
)
file_desc
=
open
(
filename
,
"w"
)
file_desc
.
write
(
xml
)
file_desc
.
close
()
def
get_tmp_file
(
recid
):
tmp_directory
=
"
%s
/bibclassify"
%
bconfig
.
CFG_TMPDIR
if
not
os
.
path
.
isdir
(
tmp_directory
):
os
.
mkdir
(
tmp_directory
)
filename
=
"bibclassify_
%s
.xml"
%
recid
abs_path
=
os
.
path
.
join
(
tmp_directory
,
filename
)
return
abs_path
def
_parse_marc_code
(
field
):
"""Parses marc field and return default indicators if not filled in"""
field
=
str
(
field
)
if
len
(
field
)
<
4
:
raise
Exception
(
'Wrong field code:
%s
'
%
field
)
else
:
field
+=
'__'
tag
=
field
[
0
:
3
]
ind1
=
field
[
3
]
.
replace
(
'_'
,
''
)
ind2
=
field
[
4
]
.
replace
(
'_'
,
''
)
return
tag
,
ind1
,
ind2
if
__name__
==
"__main__"
:
log
.
error
(
"Please use bibclassify_cli from now on."
)
Event Timeline
Log In to Comment