Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F71053299
engine.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, Jul 9, 08:52
Size
57 KB
Mime Type
text/x-python
Expires
Thu, Jul 11, 08:52 (2 d)
Engine
blob
Format
Raw Data
Handle
18900493
Attached To
R3600 invenio-infoscience
engine.py
View Options
## This file is part of Invenio.
## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""BibMatch - tool to match records with database content of an Invenio instance,
either locally or remotely through invenio_connector."""
__revision__
=
"$Id$"
import
sys
if
sys
.
hexversion
<
0x2040000
:
# pylint: disable=W0622
from
sets
import
Set
as
set
#for "&" intersection
# pylint: enable=W0622
import
os
import
getopt
import
re
import
getpass
from
six
import
iteritems
from
tempfile
import
mkstemp
from
time
import
sleep
from
invenio.config
import
CFG_SITE_SECURE_URL
,
CFG_BIBMATCH_FUZZY_WORDLIMITS
,
\
CFG_BIBMATCH_QUERY_TEMPLATES
,
\
CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT
,
\
CFG_BIBMATCH_LOCAL_SLEEPTIME
,
\
CFG_BIBMATCH_REMOTE_SLEEPTIME
,
\
CFG_SITE_RECORD
,
\
CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT
from
invenio.legacy.bibmatch.config
import
CFG_BIBMATCH_LOGGER
,
\
CFG_LOGFILE
from
invenio.utils.connector
import
InvenioConnector
,
\
InvenioConnectorAuthError
from
invenio.legacy.bibrecord
import
create_records
,
\
record_get_field_values
,
record_xml_output
,
record_modify_controlfield
,
\
record_has_field
,
record_add_field
from
invenio.legacy.bibconvert
import
api
as
bibconvert
from
invenio.legacy.search_engine
import
get_fieldcodes
from
invenio.legacy.search_engine.query_parser
import
SearchQueryParenthesisedParser
from
invenio.legacy.dbquery
import
run_sql
from
invenio.legacy.bibrecord.scripts.textmarc2xmlmarc
import
transform_file
from
invenio.legacy.bibmatch.validator
import
validate_matches
,
transform_record_to_marc
,
\
validate_tag
from
invenio.utils.text
import
translate_to_ascii
,
xml_entities_to_utf8
try
:
from
cStringIO
import
StringIO
except
ImportError
:
from
StringIO
import
StringIO
# Matches one "[field]" reference inside a query-string template together with
# the non-whitespace text glued directly before and after it, yielding
# (prefix, field-reference, suffix) tuples, e.g.
#   773__p:"[773__p]"  ->  ('773__p:"', '773__p', '"')
# Written as a raw string so the regex escapes (\s, \[, \]) are not treated as
# (invalid) Python string escapes; the compiled pattern is unchanged.
re_querystring = re.compile(r"\s?([^\s$]*)\[(.+?)\]([^\s$]*).*?", re.DOTALL)
def usage():
    """
    Print the BibMatch command-line help on stderr and exit with status 1.

    The program name shown in the "Usage:" line is taken from sys.argv[0].
    This function never returns: it always terminates through sys.exit(1).
    """
    help_text = """ BibMatch - match bibliographic data against database, either locally or remotely
Usage: %s [options] [QUERY]
Options:
Output:
-0 --print-new (default) print unmatched in stdout
-1 --print-match print matched records in stdout
-2 --print-ambiguous print records that match more than 1 existing records
-3 --print-fuzzy print records that match the longest words in existing records
-b --batch-output=(filename). filename.new will be new records, filename.matched will be matched,
filename.ambiguous will be ambiguous, filename.fuzzy will be fuzzy match
-t --text-marc-output transform the output to text-marc format instead of the default MARCXML
Simple query:
-q --query-string=(search-query/predefined-query) See "Querystring"-section below.
-f --field=(field)
General options:
-n --noprocess Do not print records in stdout.
-i, --input use a named file instead of stdin for input
-v, --verbose=LEVEL verbose level (from 0 to 9, default 1)
-r, --remote=URL match against a remote Invenio installation (Full URL, no trailing '/')
Beware: Only searches public records attached to home collection
-a, --alter-recid The recid (controlfield 001) of matched or fuzzy matched records in
output will be replaced by the 001 value of the matched record.
Note: Useful if you want to replace matched records using BibUpload.
-z, --clean clean queries before searching
--no-validation do not perform post-match validation
-h, --help print this help and exit
-V, --version print version information and exit
Advanced options:
-m --mode=(a|e|o|p|r) perform an advanced search using special search mode.
Where mode is:
"a" all of the words,
"o" any of the words,
"e" exact phrase,
"p" partial phrase,
"r" regular expression.
-o --operator(a|o) used to concatenate identical fields in search query (i.e. several report-numbers)
Where operator is:
"a" boolean AND (default)
"o" boolean OR
-c --config=filename load querystrings from a config file. Each line starting with QRYSTR will
be added as a query. i.e. QRYSTR --- [title] [author]
-x --collection only perform queries in certain collection(s).
Note: matching against restricted collections requires authentication.
--user=USERNAME username to use when connecting to Invenio instance. Useful when searching
restricted collections. You will be prompted for password.
QUERYSTRINGS
Querystrings determine which type of query/strategy to use when searching for the
matching records in the database.
Predefined querystrings:
There are some predefined querystrings available:
title - standard title search. (i.e. "this is a title") (default)
title-author - title and author search (i.e. "this is a title AND Lastname, F")
reportnumber - reportnumber search (i.e. reportnumber:REP-NO-123).
You can also add your own predefined querystrings inside invenio.conf file.
You can structure your query in different ways:
* Old-style: fieldnames separated by '||' (conforms with earlier BibMatch versions):
-q "773__p||100__a"
* New-style: Invenio query syntax with "bracket syntax":
-q "773__p:\"[773__p]\" 100__a:[100__a]"
Depending on the structure of the query, it will fetch associated values from each record and put it into
the final search query. i.e in the above example it will put journal-title from 773__p.
When more then one value/datafield is found, i.e. when looking for 700__a (additional authors),
several queries will be put together to make sure all combinations of values are accounted for.
The queries are separated with given operator (-o, --operator) value.
Note: You can add more then one query to a search, just give more (-q, --query-string) arguments.
The results of all queries will be combined when matching.
BibConvert formats:
Another option to further improve your matching strategy is to use BibConvert formats. By using the formats
available by BibConvert you can change the values from the retrieved record-fields.
i.e. using WORDS(1,R) will only return the first (1) word from the right (R). This can be very useful when
adjusting your matching parameters to better match the content. For example only getting authors last-name
instead of full-name.
You can use these formats directly in the querystrings (indicated by '::'):
* Old-style: -q "100__a::WORDS(1,R)::DOWN()"
This query will take first word from the right from 100__a and also convert it to lower-case.
* New-style: -q "100__a:[100__a::WORDS(1,R)::DOWN()]"
See BibConvert documentation for a more detailed explanation of formats.
Predefined fields:
In addition to specifying distinct MARC fields in the querystrings you can use predefined
fields as configured in the LOCAL(!) Invenio system. These fields will then be mapped to one
or more fieldtags to be retrieved from input records.
Common predefined fields used in querystrings: (for Invenio demo site, your fields may vary!)
'abstract', 'affiliation', 'anyfield', 'author', 'coden', 'collaboration',
'collection', 'datecreated', 'datemodified', 'division', 'exactauthor', 'exactfirstauthor',
'experiment', 'fulltext', 'isbn', 'issn', 'journal', 'keyword', 'recid',
'reference', 'reportnumber', 'subject', 'title', 'year'
Examples:
$ bibmatch [options] < input.xml > unmatched.xml
$ bibmatch -b out -n < input.xml
$ bibmatch -a -1 < input.xml > modified_match.xml
$ bibmatch --field=title < input.xml
$ bibmatch --field=245__a --mode=a < input.xml
$ bibmatch --print-ambiguous -q title-author < input.xml > ambigmatched.xml
$ bibmatch -q "980:Thesis 773__p:\"[773__p]\" 100__a:[100__a]" -r "http://inspirebeta.net" < input.xml
$ bibmatch --collection 'Books,Articles' < input.xml
$ bibmatch --collection 'Theses' --user admin < input.xml
""" % (sys.argv[0],)
    # sys.stderr.write works on both Python 2 and 3, unlike the
    # "print >> stream" statement; the extra newline mirrors what print added.
    sys.stderr.write(help_text + "\n")
    sys.exit(1)
class Querystring(object):
    """
    Holds the information about a querystring.

    The object keeps the field values, BibConvert formats and the template
    needed to generate search queries for one record.

    self.fields is a dict of found field-data, keyed by the full reference
        tuple (prefix, field, suffix) -> [list of found record data]
    self.formats is a dict of found BibConvert formats {"tag": [list of found format-values]}
    self.pattern contains the (lower-cased) query-string template
    self.query contains the generated query
    self.operator holds the current active operator, upper-case (OR/AND)
    self.clean tells create_query() to sanitize the generated query
    self.ascii_mode tells _extract_fieldvalues() to translate values to ascii

    To populate the Querystring instance with values and search string structure,
    call create_query(..) with BibRecord structure and a query-string to populate
    with retrieved values.

    Example: The template "title:[245__a]" will retrieve the value from subfield 245__a in
             given record. If any BibConvert formats are specified for this field, these will
             be applied.
    """

    def __init__(self, operator="AND", clean=False, ascii_mode=False):
        """
        Creates Querystring instance.

        @param operator: operator used to concatenate several queries
        @type operator: str
        @param clean: indicates if queries should be sanitized
        @type clean: bool
        @param ascii_mode: indicates if record values should be translated to ascii
        @type ascii_mode: bool
        """
        self.fields = {}
        self.operator = operator.upper()
        self.pattern = ""
        self.query = ""
        self.clean = clean
        self.ascii_mode = ascii_mode
        self.formats = {}

    def create_query(self, record, qrystr="[title]"):
        """
        Main method that parses and generates a search query from
        given query-string structure and record data. Returns the
        resulting query-string and completeness determination as a tuple.

        A query is 'complete' when all found field references has a value
        in the passed record. Should a value be missing, the query is
        incomplete.

        @param record: bibrecord to retrieve field-values from
        @type record: dict
        @param qrystr: proper query string template. (i.e. title:[245__a])
                       defaults to: [title]
        @type qrystr: str
        @return: (query-string, complete flag)
        @rtype: tuple
        """
        if qrystr == "":
            qrystr = "[title]"
        if "||" in qrystr or "[" not in qrystr:
            # Assume old style query-strings
            qrystr = self._convert_qrystr(qrystr)

        # FIXME: Convert to lower case, since fuzzy_parser
        # which treats everything lower-case, and will cause problems when
        # retrieving data from the self.fields dict.
        # Also BibConvert formats are currently case sensitive, so we cannot
        # force lower-case yet.
        self.pattern = qrystr.lower()
        self.fields = {}
        # Extract referenced field-values from given record
        complete, fieldtags_found = self._extract_fieldvalues(record, qrystr)

        # If no field references are found, we exit as empty query.
        if not self.fields:
            self.query = ""
            return self.query, False

        # Now we assemble the found values into a proper search query
        all_queries = []
        operator_delimiter = " %s " % (self.operator,)
        if self.operator == "AND":
            # Every reference in the template is replaced by all of its values,
            # each wrapped in the reference's prefix/suffix and AND-ed together.
            new_query = self.pattern
            for (field_prefix, field_reference, field_suffix), value_list in self.fields.items():
                new_values = ["%s%s%s" % (field_prefix, value, field_suffix)
                              for value in value_list]
                new_query = new_query.replace(
                    "%s[%s]%s" % (field_prefix, field_reference, field_suffix),
                    operator_delimiter.join(set(new_values)))
            all_queries = [new_query]
        else:
            # operator is OR, which means a more elaborate approach to multi-value fields
            field_tuples = []
            for key, values in self.fields.items():
                # We add key here to be able to associate the value later
                field_tuples.append([(key, value) for value in values])
            # Grab all combinations of queries
            for query in cproduct(field_tuples):
                new_query = self.pattern
                for (field_prefix, field_reference, field_suffix), value in query:
                    new_query = new_query.replace(
                        "%s[%s]%s" % (field_prefix, field_reference, field_suffix),
                        "%s%s%s" % (field_prefix, value, field_suffix))
                all_queries.append(new_query)
        # Finally we concatenate all unique queries into one, delimited by chosen operator
        self.query = operator_delimiter.join(set(all_queries))
        if not complete:
            # Clean away any leftover field-name references from query
            for fieldtag in fieldtags_found:
                self.query = self.query.replace("%s" % (fieldtag,), "")
        # Clean query?
        if self.clean:
            # _clean_query() returns the sanitized string instead of storing
            # it, so the result has to be assigned for cleaning to take effect.
            self.query = self._clean_query()
        return self.query, complete

    def fuzzy_queries(self):
        """
        Returns a list of queries that are built more 'fuzzily' using the main query as base.
        The list returned also contains the current operator in context, so each query is a tuple
        of (operator, query).

        @return: list of tuples [(operator, query), ..]
        @rtype: list [(str, str), ..]
        """
        fuzzy_query_list = []
        operator_delimiter = " %s " % (self.operator,)
        parser = SearchQueryParenthesisedParser()
        query_parts = parser.parse_query(self.pattern)
        # Go through every expression in the query and generate fuzzy searches.
        # The parsed list is walked pairwise: index i is read as the operator
        # and index i + 1 as the pattern it applies to.
        for i in range(0, len(query_parts) - 1, 2):
            current_operator = query_parts[i]
            current_pattern = query_parts[i + 1]
            fieldname_list = re_querystring.findall(current_pattern)
            if not fieldname_list:
                # No reference to record value, add query 'as is'
                fuzzy_query_list.append((current_operator, current_pattern))
                continue
            for field_prefix, field_reference, field_suffix in fieldname_list:
                for value in self.fields.get((field_prefix, field_reference, field_suffix), []):
                    # Grab the x longest words in the string and perform boolean AND for each word
                    # x is determined by the configuration dict and is tag-based. Defaults to 3 words
                    word_list = get_longest_words(
                        value, limit=CFG_BIBMATCH_FUZZY_WORDLIMITS.get(field_reference, 3))
                    # Create fuzzy query with key + word, including any surrounding
                    # elements like quotes, regexp etc.
                    new_query = [current_pattern.replace("[%s]" % (field_reference,), word)
                                 for word in word_list]
                    fuzzy_query_list.append((current_operator, operator_delimiter.join(new_query)))
        # Return a list of unique queries
        return list(set(fuzzy_query_list))

    def _clean_query(self):
        """
        Returns a copy of the generated search query (self.query) with empty
        quote pairs ('' and "") removed, as these might cause problems when
        searching. self.query itself is not modified here.

        @return: cleaned query
        @rtype: str
        """
        #FIXME: Extend cleaning to account for encodings and LaTeX symbols
        query = self.query.replace("''", "")
        query = query.replace('""', "")
        return query

    def _convert_qrystr(self, qrystr):
        """
        Converts old-style query-strings ('||'-separated field names) into
        new-style bracket syntax, e.g. "773__p||100__a" -> "[773__p] AND [100__a]".

        The references are separated by the operator surrounded by spaces:
        re_querystring treats any non-whitespace text glued to a reference as
        its prefix/suffix, so without the spaces only one reference would be
        recognised.

        @param qrystr: old-style query-string
        @type qrystr: str
        @return: new-style query-string
        @rtype: str
        """
        converted_query = ["[%s]" % (field,) for field in qrystr.split("||")]
        return (" %s " % (self.operator,)).join(converted_query)

    def _extract_fieldvalues(self, record, qrystr):
        """
        Extract all the values in the given record referenced in the given query-string
        and attach them to self.fields as a list. Return boolean indicating if a query
        is complete, and a list of all field references found.

        Field references is checked to be valid MARC tag references and all values
        found are added to self.fields as a list, hashed by the full reference including
        prefix and suffix.

        If ascii_mode is enabled, the record values will be translated to its ascii
        representation.

        e.g. for the query-string: 700__a:"[700__a]"
        { ('700__a:"', '700__a', '"') : ["Ellis, J.", "Olive, K. A."]}

        Should no values be found for a field references, the query will be flagged
        as incomplete.

        @param record: bibrecord to retrieve field-values from
        @type record: dict
        @param qrystr: proper query string template. (i.e. title:[245__a])
                       defaults to: [title]
        @type qrystr: str
        @return: complete flag, [field references found]
        @rtype: tuple
        """
        complete = True
        fieldtags_found = []
        # Find all potential references to record tag values and
        # add to fields-dict as a list of values using field-name tuple as key.
        #
        # Each reference will be split into prefix, field-ref and suffix.
        # Example:
        # 773__p:"[773__p]" 100__a:/.*[100__a].*/ =>
        # [('773__p:"', '773__p', '"'), ('100__a:/.*', '100__a', '.*/')]
        for field_prefix, field_reference, field_suffix in re_querystring.findall(qrystr):
            # First we see if there is any special formats for this field_reference
            # The returned value from _extract_formats is the field-name stripped from formats.
            # e.g. 245__a::SUP(NUM) => 245__a
            fieldname = self._extract_formats(field_reference)
            # We need everything in lower-case
            field_prefix = field_prefix.lower()
            field_suffix = field_suffix.lower()
            # Find proper MARC tag(s) for the stripped field-name, if fieldname is used.
            # e.g. author -> [100__a, 700__a]
            # FIXME: Local instance only!
            tag_list = get_field_tags_from_fieldname(fieldname)
            if not tag_list:
                tag_list = [fieldname]
            # NOTE(review): every tag of a logical field is stored under the
            # same key, so values of a later tag replace those of an earlier
            # one -- confirm this is intended.
            for field in tag_list:
                # Check if it is really a reference to a tag to not confuse with e.g. regex syntax
                tag_structure = validate_tag(field)
                if tag_structure is None:
                    continue
                tag, ind1, ind2, code = tag_structure
                value_list = record_get_field_values(record, tag, ind1, ind2, code)
                if len(value_list) > 0:
                    # Apply any BibConvert formatting functions to each value
                    updated_value_list = self._apply_formats(fieldname, value_list)
                    # Also convert any XML entities left in the values
                    updated_value_list = [xml_entities_to_utf8(v, skip=[])
                                          for v in updated_value_list]
                    if self.ascii_mode:
                        updated_value_list = translate_to_ascii(updated_value_list)
                    # Store found values linked to full field reference tuple including
                    # (prefix, field, suffix)
                    self.fields[(field_prefix, fieldname, field_suffix)] = updated_value_list
                else:
                    # No values found. The query is deemed incomplete
                    complete = False
                    fieldtags_found.append("%s[%s]%s" % (field_prefix, fieldname, field_suffix))
        return complete, fieldtags_found

    def _extract_formats(self, field_reference):
        """
        Looks for BibConvert formats within query-strings and adds to
        the instance. Formats are defined by one or more '::' followed
        by a format keyword which is defined in BibConvert FormatField()
        method.

        The function also removes the references to formatting functions
        in the query (self.pattern)

        Returns the field_reference reference, with formats stripped.
        """
        field_parts = field_reference.split("::")
        fieldname = field_parts[0]
        if len(field_parts) > 1:
            # Remember every format given for this field-name
            self.formats.setdefault(fieldname, []).extend(field_parts[1:])
            # Remove any references to BibConvert functions in pattern.
            # e.g. 245__a::SUP(PUNCT, ) -> 245__a
            # self.pattern is lower cased, hence the lower-cased search string.
            self.pattern = self.pattern.replace("[%s]" % (field_reference.lower(),),
                                                "[%s]" % (fieldname,))
        return fieldname

    def _apply_formats(self, fieldname, value_list):
        """
        Apply the current stored BibConvert formating operations for a
        field-name to the given list of strings. The list is then returned.

        Values consisting only of whitespace are passed through unformatted.
        When no formats are stored for the field-name, the given list itself
        is returned.

        @param fieldname: name of field - used as key in the formats dict
        @type fieldname: string
        @param value_list: list of strings to apply formats to
        @type value_list: list
        @return: list of values with formatting functions applied
        @rtype: list
        """
        if fieldname not in self.formats:
            return value_list
        new_list = []
        for value in value_list:
            if value.strip() != "":
                # Apply BibConvert formats if applicable
                for aformat in self.formats[fieldname]:
                    value = bibconvert.FormatField(value, aformat)
            new_list.append(value)
        return new_list
def get_field_tags_from_fieldname(field):
    """
    Gets the list of tag values linked in the database to the logical field
    whose code is 'field' (looked up through the tag, field_tag and field
    tables).

    @param field: code of the logical field to look up
    @type field: str
    @return: list with the first column (tag.value) of every row found;
             empty when nothing matches
    @rtype: list
    """
    # The field code is handed to run_sql as a parameter instead of being
    # interpolated into the statement, so quotes in it can neither break nor
    # alter the query.
    query = "select tag.value from tag left join field_tag on tag.id=field_tag.id_tag " \
            + "left join field on field_tag.id_field=field.id where field.code=%s"
    res = run_sql(query, (field,))
    return [row[0] for row in res]
def cproduct(args):
    """
    Returns the Cartesian product of passed arguments as a list of tuples.

    '12','34' -> ('1', '3'), ('1', '4'), ('2', '3'), ('2', '4')

    The right-most element varies fastest. An empty 'args' gives [()], and
    an empty element inside 'args' gives [].

    @param args: iterable with elements to compute
    @type args: iterable
    @return list containing tuples for each computed combination
    @rtype list of tuples
    """
    # Imported locally so this function does not rely on a module-level import.
    import itertools
    return list(itertools.product(*args))
def bylen(word1, word2):
    """
    Sort comparison function ordering items by their length.

    @return: negative when word1 is shorter than word2, zero when both have
             the same length, positive when word1 is longer
    @rtype: int
    """
    first_length = len(word1)
    second_length = len(word2)
    return first_length - second_length
def get_longest_words(wstr, limit=5):
    """
    Select the longest words for matching. It selects the longest words from
    the string, according to a given limit of words. By default the 5 longest word are selected

    The words are returned longest first; among words of equal length, the
    one occurring later in the string comes first.

    @param wstr: string to extract the longest words from
    @type wstr: str
    @param limit: maximum number of words extracted
    @type limit: int
    @return: list of long words (empty when wstr is empty or None)
    @rtype: list
    """
    if not wstr:
        return []
    words = wstr.split()
    # key=len replaces the cmp-function sort, which Python 3 no longer
    # supports. Sorting ascending and then reversing (rather than using
    # reverse=True) keeps the historical order of equally long words.
    words.sort(key=len)
    words.reverse()
    return words[:limit]
def add_recid(record, recid):
    """
    Store the given record-id in the record's 001 controlfield.

    When the record has no 001 field yet, one is added; otherwise the
    existing controlfield is modified to carry the new value.

    @param record: the record to update
    @type record: a bibrecord instance
    @param recid: record-id to be added
    @type recid: int
    """
    recid_tag = '001'
    if not record_has_field(record, recid_tag):
        record_add_field(record, recid_tag, controlfield_value=str(recid))
        return
    record_modify_controlfield(record, recid_tag,
                               controlfield_value=str(recid),
                               field_position_global=1)
def match_result_output(bibmatch_recid, recID_list, server_url, query, matchmode="no match"):
    """
    Generates result as XML comments from passed record and matching parameters.

    One "BibMatch-Matching-Found" comment is emitted per id in recID_list,
    between the record identifier line and the mode/criteria lines.

    @param bibmatch_recid: BibMatch record identifier
    @type bibmatch_recid: int
    @param recID_list: record matched with record
    @type recID_list: list
    @param server_url: url to the server the matching has been performed
    @type server_url: str
    @param query: matching query
    @type query: str
    @param matchmode: matching type
    @type matchmode: str
    @rtype str
    @return XML result string (comment lines joined by newlines)
    """
    result = ["<!-- BibMatch-Matching-Results: -->",
              "<!-- BibMatch-Matching-Record-Identifier: %s -->" % (bibmatch_recid,)]
    for recID in recID_list:
        result.append("<!-- BibMatch-Matching-Found: %s/%s/%s -->"
                      % (server_url, CFG_SITE_RECORD, recID))
    result.append("<!-- BibMatch-Matching-Mode: %s -->"
                  % (matchmode,))
    # "--" is not allowed inside an XML comment, and the query is built from
    # record data, so runs of hyphens are collapsed to keep the output well-formed.
    criteria = re.sub(r"-{2,}", "-", "%s" % (query,))
    result.append("<!-- BibMatch-Matching-Criteria: %s -->"
                  % (criteria,))
    return "\n".join(result)
def _collect_unique_recids(result_pairs):
    """Return the set of record ids found in any (recids, query) pair."""
    unique_recids = set()
    for recids, dummy_query in result_pairs:
        unique_recids.update(recids)
    return unique_recids


def _report_final_result(record_counter, outcome, verbose, matched_at=None):
    """Write the final verdict for one record to stderr (verbose > 1) and to the BibMatch log."""
    if verbose > 1:
        if matched_at is None:
            sys.stderr.write("Final result: %s\n" % (outcome,))
        else:
            # matched_at is (server_url, recid) of the single exact match
            sys.stderr.write("Final result: %s - %s/record/%d\n" % ((outcome,) + matched_at))
    CFG_BIBMATCH_LOGGER.info("Matching of record %d: Completed as '%s'" % (record_counter, outcome))


def match_records(records, qrystrs=None, search_mode=None, operator="and", \
                  verbose=1, server_url=CFG_SITE_SECURE_URL, modify=0, \
                  sleeptime=CFG_BIBMATCH_LOCAL_SLEEPTIME, \
                  clean=False, collections=None, user="", password="", \
                  fuzzy=True, validate=True, ascii_mode=False):
    """
    Match passed records with existing records on a local or remote Invenio
    installation. Returns which records are new (no match), which are matched,
    which are ambiguous and which are fuzzy-matched. A formatted result of each
    records matching is stored next to the record, as a tuple
    (record, result), in the list it was classified into.

    A record is classified as:
     * matched   - exactly one query matched and it found exactly one record
     * ambiguous - the single matching query found several records, the fuzzy
                   results point to more than one record, or only ambiguous
                   results exist
     * fuzzy     - all fuzzy results point to one and the same record
     * new       - none of the above

    Should the connection to the server fail with an authentication error,
    the four (empty) lists are returned right away.

    @param records: records to analyze; the bibrecord is read from index 0
                    of each item
    @type records: list of records
    @param qrystrs: list of tuples (field, querystring)
    @type qrystrs: list
    @param search_mode: if mode is given, the search will perform an advanced
                        query using the desired mode. Otherwise 'simple search'
                        is used.
    @type search_mode: str
    @param operator: operator used to concatenate values of fields occurring more then once.
                     Valid types are: AND, OR. Defaults to AND.
    @type operator: str
    @param verbose: be loud
    @type verbose: int
    @param server_url: which server to search on. Local installation by default
    @type server_url: str
    @param modify: when true, the 001 field of exactly matched records is set
                   to the id of the record they matched
    @type modify: int
    @param sleeptime: amount of time to wait between each query
    @type sleeptime: float
    @param clean: should the search queries be cleaned before passed them along?
    @type clean: bool
    @param collections: list of collections to search, if specified
    @type collections: list
    @param user: username in case of authenticated search requests
    @type user: string
    @param password: password in case of authenticated search requests
    @type password: string
    @param fuzzy: True to activate fuzzy query matching step
    @type fuzzy: bool
    @param validate: True to activate match validation
    @type validate: bool
    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool
    @rtype: list of lists
    @return an array of arrays of records, like this [newrecs,matchedrecs,
                                                      ambiguousrecs,fuzzyrecs]
    """
    # None stands in for "no collections" so that no list object is shared
    # between calls through the default argument.
    if collections is None:
        collections = []
    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []
    CFG_BIBMATCH_LOGGER.info("-- BibMatch starting match of %d records --" % (len(records),))
    try:
        server = InvenioConnector(server_url, user=user, password=password)
    except InvenioConnectorAuthError as error:
        if verbose > 0:
            sys.stderr.write("Authentication error when connecting to server: %s" \
                             % (str(error),))
        CFG_BIBMATCH_LOGGER.info("-- BibMatch ending match with errors (AuthError) --")
        return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]

    # At least one (field, querystring) tuple is needed for default search query
    if not qrystrs:
        qrystrs = [("", "")]

    ## Go through each record and try to find matches using defined querystrings
    for record_counter, record in enumerate(records, 1):
        bibrec = record[0]
        if verbose > 1:
            sys.stderr.write("\nProcessing record: #%d .." % (record_counter,))
        CFG_BIBMATCH_LOGGER.info("Matching of record %d: Started" % (record_counter,))
        matched_results, ambiguous_results, fuzzy_results = match_record(
            bibmatch_recid=record_counter,
            record=bibrec,
            server=server,
            qrystrs=qrystrs,
            search_mode=search_mode,
            operator=operator,
            verbose=verbose,
            sleeptime=sleeptime,
            clean=clean,
            collections=collections,
            fuzzy=fuzzy,
            validate=validate,
            ascii_mode=ascii_mode)

        ## Evaluate final results for record
        # Add matched record iff number found is equal to one, otherwise return fuzzy,
        # ambiguous or no match
        matched_at = None
        if len(matched_results) == 1:
            results, query = matched_results[0]
            # If one match, add it as exact match, otherwise ambiguous
            if len(results) == 1:
                if modify:
                    add_recid(bibrec, results[0])
                target, matchmode, outcome = matchedrecs, "exact-matched", "match"
                matched_at = (server_url, results[0])
            else:
                target, matchmode, outcome = ambiguousrecs, "ambiguous-matched", "ambiguous"
        elif len(fuzzy_results) > 0:
            # Find common record-id for all fuzzy results and grab first query
            # as "representative" query
            query = fuzzy_results[0][1]
            results = _collect_unique_recids(fuzzy_results)
            if len(results) == 1:
                target, matchmode, outcome = fuzzyrecs, "fuzzy-matched", "fuzzy"
            else:
                target, matchmode, outcome = ambiguousrecs, "ambiguous-matched", "ambiguous"
        elif len(ambiguous_results) > 0:
            # Find common record-id for all ambiguous results and grab first query
            # as "representative" query
            query = ambiguous_results[0][1]
            results = _collect_unique_recids(ambiguous_results)
            target, matchmode, outcome = ambiguousrecs, "ambiguous-matched", "ambiguous"
        else:
            # Nothing found at all: report the querystrings that were tried
            query = str(qrystrs)
            results = []
            target, matchmode, outcome = newrecs, "no match", "new"

        target.append((bibrec, match_result_output(record_counter, results, server_url, \
                                                   query, matchmode)))
        _report_final_result(record_counter, outcome, verbose, matched_at)

    CFG_BIBMATCH_LOGGER.info("-- BibMatch ending match: New(%d), Matched(%d), Ambiguous(%d), Fuzzy(%d) --" % \
                             (len(newrecs), len(matchedrecs), len(ambiguousrecs), len(fuzzyrecs)))
    return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]
def match_record(bibmatch_recid, record, server, qrystrs=None, search_mode=None,
                 operator="and", verbose=1, sleeptime=CFG_BIBMATCH_LOCAL_SLEEPTIME,
                 clean=False, collections=None, fuzzy=True, validate=True,
                 ascii_mode=False):
    """
    Matches a single record.

    Every (field, querystring) pair is turned into a query and searched on
    the given server. The search stops at the first exact match. When the
    loop over the querystrings ends without being interrupted, and fuzzy
    matching is enabled, every generated query is split into fuzzy
    sub-queries whose results are combined according to their operator.

    @param bibmatch_recid: Current record number. Used for logging.
    @type bibmatch_recid: int

    @param record: record to match in BibRecord structure
    @type record: dict

    @param server: InvenioConnector server object
    @type server: object

    @param qrystrs: list of tuples (field, querystring). None is treated as
                    an empty list.
    @type qrystrs: list

    @param search_mode: if mode is given, the search will perform an advanced
                        query using the desired mode. Otherwise 'simple search'
                        is used.
    @type search_mode: str

    @param operator: operator used to concatenate values of fields occurring more than once.
                     Valid types are: AND, OR. Defaults to AND.
    @type operator: str

    @param verbose: be loud
    @type verbose: int

    @param sleeptime: amount of time to wait between each query
    @type sleeptime: float

    @param clean: should the search queries be cleaned before passing them along?
    @type clean: bool

    @param collections: list of collections to search, if specified. None is
                        treated as an empty list.
    @type collections: list

    @param fuzzy: True to activate fuzzy query matching step
    @type fuzzy: bool

    @param validate: True to activate match validation
    @type validate: bool

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: [matched_results, ambiguous_results, fuzzy_results], each one a
             list of (record ids, query) tuples
    @rtype: list
    """
    # Normalise the optional list arguments: avoids a shared mutable default
    # and a TypeError when iterating over the documented default of None.
    if qrystrs is None:
        qrystrs = []
    if collections is None:
        collections = []

    matched_results = []
    ambiguous_results = []
    fuzzy_results = []

    # Keep a list of generated querystring objects for later use in fuzzy match
    query_list = []

    # Go through each querystring, trying to find a matching record
    # Stops on first valid match, if no exact-match we continue with fuzzy match
    for field, qrystr in qrystrs:
        querystring = Querystring(operator, clean=clean, ascii_mode=ascii_mode)
        query, complete = querystring.create_query(record, qrystr)
        if query == "":
            if verbose > 1:
                sys.stderr.write("\nEmpty query. Skipping...\n")
            # Empty query, no point searching database
            continue
        query_list.append((querystring, complete, field))
        if not complete and verbose > 1:
            sys.stderr.write("\nQuery not complete. Flagged as uncertain/ambiguous...\n")

        # Determine proper search parameters
        if search_mode is not None:
            search_params = dict(p1=query, f1=field, m1=search_mode, of='id', c=collections)
        else:
            search_params = dict(p=query, f=field, of='id', c=collections)
        if verbose > 8:
            sys.stderr.write("\nSearching with values %s\n" % (search_params,))

        ## Perform the search with retries
        try:
            result_recids = server.search_with_retry(**search_params)
        except InvenioConnectorAuthError as error:
            if verbose > 0:
                sys.stderr.write("Authentication error when searching: %s"
                                 % (str(error),))
            # Leaving with break also skips the fuzzy step (for/else below)
            break

        sleep(sleeptime)

        ## Check results:
        if len(result_recids) > 0:
            # Matches detected
            if len(result_recids) > CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT:
                # Too many matches, treat as non-match
                if verbose > 8:
                    sys.stderr.write("result=More then %d results...\n" %
                                     (CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT,))
                continue

            if verbose > 8:
                sys.stderr.write("result=%s\n" % (result_recids,))

            if validate:
                # Validation can be run
                CFG_BIBMATCH_LOGGER.info("Matching of record %d: Query (%s) found %d records: %s" %
                                         (bibmatch_recid,
                                          query,
                                          len(result_recids),
                                          str(result_recids)))

                exact_matches, fuzzy_matches = validate_matches(bibmatch_recid=bibmatch_recid,
                                                                record=record,
                                                                server=server,
                                                                result_recids=result_recids,
                                                                collections=collections,
                                                                verbose=verbose,
                                                                ascii_mode=ascii_mode)

                if len(exact_matches) > 0:
                    if verbose > 8:
                        sys.stderr.write("Match validated\n")
                    matched_results.append((exact_matches, query))
                    break
                elif len(fuzzy_matches) > 0:
                    if verbose > 8:
                        sys.stderr.write("Match validated fuzzily\n")
                    fuzzy_results.append((fuzzy_matches, query))
                    continue
                else:
                    # Falls through to the "No matches" message below
                    if verbose > 8:
                        sys.stderr.write("Match could not be validated\n")
            else:
                # No validation
                if len(result_recids) > 1:
                    # Ambiguous match
                    ambiguous_results.append((result_recids, query))
                    if verbose > 8:
                        sys.stderr.write("Ambiguous\n")
                    continue
                elif len(result_recids) == 1:
                    if complete:
                        # Match
                        matched_results.append((result_recids, query))
                        if verbose > 8:
                            sys.stderr.write("Match\n")
                        # This was a complete match, so let's break out to avoid more searching
                        break
                    else:
                        # We treat the result as ambiguous (uncertain) when query is not complete
                        # and we are not validating it.
                        ambiguous_results.append((result_recids, query))
                        if verbose > 8:
                            sys.stderr.write("Ambiguous\n")
                        continue
        # No match
        if verbose > 8:
            sys.stderr.write("result=No matches\n")
    else:
        # No complete matches, lets try fuzzy matching of all the queries
        if fuzzy:
            if verbose > 8:
                sys.stderr.write("\nFuzzy query mode...\n")
            ## Fuzzy matching: Analyze all queries and perform individual searches, then intersect results.
            for querystring, complete, field in query_list:
                result_hitset = None
                fuzzy_query_list = querystring.fuzzy_queries()
                empty_results = 0
                # Go through every expression in the query and generate fuzzy searches
                for current_operator, qry in fuzzy_query_list:
                    current_resultset = None
                    search_params = dict(p=qry, f=field, of='id', c=collections)
                    try:
                        current_resultset = server.search_with_retry(**search_params)
                    except InvenioConnectorAuthError as error:
                        if verbose > 0:
                            sys.stderr.write("Authentication error when searching: %s"
                                             % (str(error),))
                        # Skips the for/else evaluation of this query
                        break
                    if verbose > 8:
                        if len(current_resultset) > CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT:
                            sys.stderr.write("\nSearching with values %s result=%s\n" %
                                             (search_params, "More then %d results..." %
                                              (CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT,)))
                        else:
                            sys.stderr.write("\nSearching with values %s result=%s\n" %
                                             (search_params, current_resultset))
                    sleep(sleeptime)
                    if current_resultset is None:
                        continue
                    if current_resultset == [] and empty_results < CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT:
                        # Allows some empty results
                        empty_results += 1
                    else:
                        # Intersect results with previous results depending on current operator
                        if result_hitset is None:
                            result_hitset = current_resultset
                        if current_operator == '+':
                            result_hitset = list(set(result_hitset) & set(current_resultset))
                        elif current_operator == '-':
                            result_hitset = list(set(result_hitset) - set(current_resultset))
                        elif current_operator == '|':
                            result_hitset = list(set(result_hitset) | set(current_resultset))
                else:
                    # All sub-queries were searched without interruption
                    if result_hitset and len(result_hitset) < CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT:
                        # This was a fuzzy match
                        query_out = " ".join(["%s %s" % (op, qu) for op, qu in fuzzy_query_list])
                        if validate:
                            # We can run validation
                            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Query (%s) found %d records: %s" %
                                                     (bibmatch_recid,
                                                      query_out,
                                                      len(result_hitset),
                                                      str(result_hitset)))
                            exact_matches, fuzzy_matches = validate_matches(bibmatch_recid=bibmatch_recid,
                                                                            record=record,
                                                                            server=server,
                                                                            result_recids=result_hitset,
                                                                            collections=collections,
                                                                            verbose=verbose,
                                                                            ascii_mode=ascii_mode)
                            if len(exact_matches) > 0:
                                if verbose > 8:
                                    sys.stderr.write("Match validated\n")
                                matched_results.append((exact_matches, query_out))
                                # Exact match found: stop processing further queries
                                break
                            elif len(fuzzy_matches) > 0:
                                if verbose > 8:
                                    sys.stderr.write("Match validated fuzzily\n")
                                fuzzy_results.append((fuzzy_matches, query_out))
                            else:
                                if verbose > 8:
                                    sys.stderr.write("Match could not be validated\n")
                        else:
                            # No validation
                            if len(result_hitset) == 1 and complete:
                                fuzzy_results.append((result_hitset, query_out))
                                if verbose > 8:
                                    sys.stderr.write("Fuzzy: %s\n" % (result_hitset,))
                            else:
                                # We treat the result as ambiguous (uncertain) when:
                                # - query is not complete
                                # - more then one result
                                ambiguous_results.append((result_hitset, query_out))
                                if verbose > 8:
                                    sys.stderr.write("Ambiguous\n")
    return [matched_results, ambiguous_results, fuzzy_results]
.
write
(
"
\n
Query not complete. Flagged as uncertain/ambiguous...
\n
"
)
# Determine proper search parameters
if
search_mode
!=
None
:
search_params
=
dict
(
p1
=
query
,
f1
=
field
,
m1
=
search_mode
,
of
=
'id'
,
c
=
collections
)
else
:
search_params
=
dict
(
p
=
query
,
f
=
field
,
of
=
'id'
,
c
=
collections
)
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"
\n
Searching with values
%s
\n
"
%
(
search_params
,))
## Perform the search with retries
try
:
result_recids
=
server
.
search_with_retry
(
**
search_params
)
except
InvenioConnectorAuthError
as
error
:
if
verbose
>
0
:
sys
.
stderr
.
write
(
"Authentication error when searching:
%s
"
\
%
(
str
(
error
),))
break
sleep
(
sleeptime
)
## Check results:
if
len
(
result_recids
)
>
0
:
# Matches detected
if
len
(
result_recids
)
>
CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT
:
# Too many matches, treat as non-match
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"result=More then
%d
results...
\n
"
%
\
(
CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT
,))
continue
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"result=
%s
\n
"
%
(
result_recids
,))
if
validate
:
# Validation can be run
CFG_BIBMATCH_LOGGER
.
info
(
"Matching of record
%d
: Query (
%s
) found
%d
records:
%s
"
%
\
(
bibmatch_recid
,
query
,
len
(
result_recids
),
str
(
result_recids
)))
exact_matches
,
fuzzy_matches
=
validate_matches
(
bibmatch_recid
=
bibmatch_recid
,
\
record
=
record
,
\
server
=
server
,
\
result_recids
=
result_recids
,
\
collections
=
collections
,
\
verbose
=
verbose
,
\
ascii_mode
=
ascii_mode
)
if
len
(
exact_matches
)
>
0
:
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"Match validated
\n
"
)
matched_results
.
append
((
exact_matches
,
query
))
break
elif
len
(
fuzzy_matches
)
>
0
:
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"Match validated fuzzily
\n
"
)
fuzzy_results
.
append
((
fuzzy_matches
,
query
))
continue
else
:
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"Match could not be validated
\n
"
)
else
:
# No validation
# Ambiguous match
if
len
(
result_recids
)
>
1
:
ambiguous_results
.
append
((
result_recids
,
query
))
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"Ambiguous
\n
"
)
continue
# Match
elif
len
(
result_recids
)
==
1
:
if
complete
:
matched_results
.
append
((
result_recids
,
query
))
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"Match
\n
"
)
# This was a complete match, so let's break out to avoid more searching
break
else
:
# We treat the result as ambiguous (uncertain) when query is not complete
# and we are not validating it.
ambiguous_results
.
append
((
result_recids
,
query
))
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"Ambiguous
\n
"
)
continue
# No match
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"result=No matches
\n
"
)
# No complete matches, lets try fuzzy matching of all the queries
else
:
if
fuzzy
:
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"
\n
Fuzzy query mode...
\n
"
)
## Fuzzy matching: Analyze all queries and perform individual searches, then intersect results.
for
querystring
,
complete
,
field
in
query_list
:
result_hitset
=
None
fuzzy_query_list
=
querystring
.
fuzzy_queries
()
empty_results
=
0
# Go through every expression in the query and generate fuzzy searches
for
current_operator
,
qry
in
fuzzy_query_list
:
current_resultset
=
None
search_params
=
dict
(
p
=
qry
,
f
=
field
,
of
=
'id'
,
c
=
collections
)
try
:
current_resultset
=
server
.
search_with_retry
(
**
search_params
)
except
InvenioConnectorAuthError
as
error
:
if
(
verbose
>
0
):
sys
.
stderr
.
write
(
"Authentication error when searching:
%s
"
\
%
(
str
(
error
),))
break
if
(
verbose
>
8
):
if
len
(
current_resultset
)
>
CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT
:
sys
.
stderr
.
write
(
"
\n
Searching with values
%s
result=
%s
\n
"
%
(
search_params
,
"More then
%d
results..."
%
\
(
CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT
,)))
else
:
sys
.
stderr
.
write
(
"
\n
Searching with values
%s
result=
%s
\n
"
%
(
search_params
,
current_resultset
))
sleep
(
sleeptime
)
if
current_resultset
==
None
:
continue
if
current_resultset
==
[]
and
empty_results
<
CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT
:
# Allows some empty results
empty_results
+=
1
else
:
# Intersect results with previous results depending on current operator
if
result_hitset
==
None
:
result_hitset
=
current_resultset
if
current_operator
==
'+'
:
result_hitset
=
list
(
set
(
result_hitset
)
&
set
(
current_resultset
))
elif
current_operator
==
'-'
:
result_hitset
=
list
(
set
(
result_hitset
)
-
set
(
current_resultset
))
elif
current_operator
==
'|'
:
result_hitset
=
list
(
set
(
result_hitset
)
|
set
(
current_resultset
))
else
:
if
result_hitset
and
len
(
result_hitset
)
<
CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT
:
# This was a fuzzy match
query_out
=
" "
.
join
([
"
%s
%s
"
%
(
op
,
qu
)
for
op
,
qu
in
fuzzy_query_list
])
if
validate
:
# We can run validation
CFG_BIBMATCH_LOGGER
.
info
(
"Matching of record
%d
: Query (
%s
) found
%d
records:
%s
"
%
\
(
bibmatch_recid
,
query_out
,
len
(
result_hitset
),
str
(
result_hitset
)))
exact_matches
,
fuzzy_matches
=
validate_matches
(
bibmatch_recid
=
bibmatch_recid
,
\
record
=
record
,
\
server
=
server
,
\
result_recids
=
result_hitset
,
\
collections
=
collections
,
\
verbose
=
verbose
,
\
ascii_mode
=
ascii_mode
)
if
len
(
exact_matches
)
>
0
:
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"Match validated
\n
"
)
matched_results
.
append
((
exact_matches
,
query_out
))
break
elif
len
(
fuzzy_matches
)
>
0
:
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"Match validated fuzzily
\n
"
)
fuzzy_results
.
append
((
fuzzy_matches
,
query_out
))
else
:
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"Match could not be validated
\n
"
)
else
:
# No validation
if
len
(
result_hitset
)
==
1
and
complete
:
fuzzy_results
.
append
((
result_hitset
,
query_out
))
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"Fuzzy:
%s
\n
"
%
(
result_hitset
,))
else
:
# We treat the result as ambiguous (uncertain) when:
# - query is not complete
# - more then one result
ambiguous_results
.
append
((
result_hitset
,
query_out
))
if
(
verbose
>
8
):
sys
.
stderr
.
write
(
"Ambiguous
\n
"
)
return
[
matched_results
,
ambiguous_results
,
fuzzy_results
]
def
transform_input_to_marcxml
(
filename
=
None
,
file_input
=
""
):
"""
Takes the filename or input of text-marc and transforms it
to MARCXML.
"""
if
not
filename
:
# Create temporary file to read from
tmp_fd
,
filename
=
mkstemp
()
os
.
write
(
tmp_fd
,
file_input
)
os
.
close
(
tmp_fd
)
try
:
# Redirect output, transform, restore old references
old_stdout
=
sys
.
stdout
new_stdout
=
StringIO
()
sys
.
stdout
=
new_stdout
transform_file
(
filename
)
finally
:
sys
.
stdout
=
old_stdout
return
new_stdout
.
getvalue
()
def
bibrecs_has_errors
(
bibrecs
):
"""
Utility function to check a list of parsed BibRec objects, directly
from the output of bibrecord.create_records(), for any
badly parsed records.
If an error-code is present in the result the function will return True,
otherwise False.
"""
return
0
in
[
err_code
for
dummy
,
err_code
,
dummy2
in
bibrecs
]
def
main
():
"""
Record matches database content when defined search gives
exactly one record in the result set. By default the match is
done on the title field.
"""
try
:
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
"0123hVm:fq:c:nv:o:b:i:r:tazx:"
,
[
"print-new"
,
"print-match"
,
"print-ambiguous"
,
"print-fuzzy"
,
"help"
,
"version"
,
"mode="
,
"field="
,
"query-string="
,
"config="
,
"no-process"
,
"verbose="
,
"operator="
,
"batch-output="
,
"input="
,
"remote="
,
"text-marc-output"
,
"alter-recid"
,
"clean"
,
"collection="
,
"user="
,
"no-fuzzy"
,
"no-validation"
,
"ascii"
])
except
getopt
.
GetoptError
as
e
:
usage
()
match_results
=
[]
qrystrs
=
[]
# list of query strings
print_mode
=
0
# default match mode to print new records
noprocess
=
0
# dump result in stdout?
operator
=
"and"
verbose
=
1
# 0..be quiet
records
=
[]
batch_output
=
""
# print stuff in files
f_input
=
""
# read from where, if param "i"
server_url
=
CFG_SITE_SECURE_URL
# url to server performing search, local by default
modify
=
0
# alter output with matched record identifiers
textmarc_output
=
0
# output in MARC instead of MARCXML
field
=
""
search_mode
=
None
# activates a mode, uses advanced search instead of simple
sleeptime
=
CFG_BIBMATCH_LOCAL_SLEEPTIME
# the amount of time to sleep between queries, changes on remote queries
clean
=
False
# should queries be sanitized?
collections
=
[]
# only search certain collections?
user
=
""
password
=
""
validate
=
True
# should matches be validate?
fuzzy
=
True
# Activate fuzzy-mode if no matches found for a record
ascii_mode
=
False
# Should values be turned into ascii mode
for
opt
,
opt_value
in
opts
:
if
opt
in
[
"-0"
,
"--print-new"
]:
print_mode
=
0
if
opt
in
[
"-1"
,
"--print-match"
]:
print_mode
=
1
if
opt
in
[
"-2"
,
"--print-ambiguous"
]:
print_mode
=
2
if
opt
in
[
"-3"
,
"--print-fuzzy"
]:
print_mode
=
3
if
opt
in
[
"-n"
,
"--no-process"
]:
noprocess
=
1
if
opt
in
[
"-h"
,
"--help"
]:
usage
()
sys
.
exit
(
0
)
if
opt
in
[
"-V"
,
"--version"
]:
print
__revision__
sys
.
exit
(
0
)
if
opt
in
[
"-t"
,
"--text-marc-output"
]:
textmarc_output
=
1
if
opt
in
[
"-v"
,
"--verbose"
]:
verbose
=
int
(
opt_value
)
if
opt
in
[
"-f"
,
"--field"
]:
if
opt_value
in
get_fieldcodes
():
field
=
opt_value
if
opt
in
[
"-q"
,
"--query-string"
]:
try
:
template
=
CFG_BIBMATCH_QUERY_TEMPLATES
[
opt_value
]
qrystrs
.
append
((
field
,
template
))
except
KeyError
:
qrystrs
.
append
((
field
,
opt_value
))
if
opt
in
[
"-m"
,
"--mode"
]:
search_mode
=
opt_value
if
opt
in
[
"-o"
,
"--operator"
]:
if
opt_value
.
lower
()
in
[
"o"
,
"or"
,
"|"
]:
operator
=
"or"
elif
opt_value
.
lower
()
in
[
"a"
,
"and"
,
"&"
]:
operator
=
"and"
if
opt
in
[
"-b"
,
"--batch-output"
]:
batch_output
=
opt_value
if
opt
in
[
"-i"
,
"--input"
]:
f_input
=
opt_value
if
opt
in
[
"-r"
,
"--remote"
]:
server_url
=
opt_value
sleeptime
=
CFG_BIBMATCH_REMOTE_SLEEPTIME
if
opt
in
[
"-a"
,
"--alter-recid"
]:
modify
=
1
if
opt
in
[
"-z"
,
"--clean"
]:
clean
=
True
if
opt
in
[
"-c"
,
"--config"
]:
config_file
=
opt_value
config_file_read
=
bibconvert
.
read_file
(
config_file
,
0
)
for
line
in
config_file_read
:
tmp
=
line
.
split
(
"---"
)
if
(
tmp
[
0
]
==
"QRYSTR"
):
qrystrs
.
append
((
field
,
tmp
[
1
]))
if
opt
in
[
"-x"
,
"--collection"
]:
colls
=
opt_value
.
split
(
','
)
for
collection
in
colls
:
if
collection
not
in
collections
:
collections
.
append
(
collection
)
if
opt
in
[
"--user"
]:
user
=
opt_value
password
=
getpass
.
getpass
()
if
opt
==
"--no-fuzzy"
:
fuzzy
=
False
if
opt
==
"--no-validation"
:
validate
=
False
if
opt
==
"--ascii"
:
ascii_mode
=
True
if
verbose
:
sys
.
stderr
.
write
(
"
\n
BibMatch: Parsing input file
%s
..."
%
(
f_input
,))
read_list
=
[]
if
not
f_input
:
for
line_in
in
sys
.
stdin
:
read_list
.
append
(
line_in
)
else
:
f
=
open
(
f_input
)
for
line_in
in
f
:
read_list
.
append
(
line_in
)
f
.
close
()
file_read
=
""
.
join
(
read_list
)
# Detect input type
if
not
file_read
.
strip
()
.
startswith
(
'<'
):
# Not xml, assume type textmarc
file_read
=
transform_input_to_marcxml
(
f_input
,
file_read
)
records
=
create_records
(
file_read
)
if
len
(
records
)
==
0
:
if
verbose
:
sys
.
stderr
.
write
(
"
\n
BibMatch: Input file contains no records.
\n
"
)
sys
.
exit
(
1
)
# Check for any parsing errors in records
if
bibrecs_has_errors
(
records
):
# Errors found. Let's try to remove any XML entities
if
verbose
>
8
:
sys
.
stderr
.
write
(
"
\n
BibMatch: Parsing error. Trying removal of XML entities..
\n
"
)
file_read
=
xml_entities_to_utf8
(
file_read
)
records
=
create_records
(
file_read
)
if
bibrecs_has_errors
(
records
):
# Still problems.. alert the user and exit
if
verbose
:
errors
=
"
\n
"
.
join
([
str
(
err_msg
)
for
dummy
,
err_code
,
err_msg
in
records
\
if
err_code
==
0
])
sys
.
stderr
.
write
(
"
\n
BibMatch: Errors during record parsing:
\n
%s
\n
"
%
\
(
errors
,))
sys
.
exit
(
1
)
if
verbose
:
sys
.
stderr
.
write
(
"read
%d
records"
%
(
len
(
records
),))
sys
.
stderr
.
write
(
"
\n
BibMatch: Matching ..."
)
if
not
validate
:
if
verbose
:
sys
.
stderr
.
write
(
"
\n
WARNING: Skipping match validation.
\n
"
)
match_results
=
match_records
(
records
=
records
,
qrystrs
=
qrystrs
,
search_mode
=
search_mode
,
operator
=
operator
,
verbose
=
verbose
,
server_url
=
server_url
,
modify
=
modify
,
sleeptime
=
sleeptime
,
clean
=
clean
,
collections
=
collections
,
user
=
user
,
password
=
password
,
fuzzy
=
fuzzy
,
validate
=
validate
,
ascii_mode
=
ascii_mode
)
# set the output according to print..
# 0-newrecs 1-matchedrecs 2-ambiguousrecs 3-fuzzyrecs
recs_out
=
match_results
[
print_mode
]
if
verbose
:
sys
.
stderr
.
write
(
"
\n\n
Bibmatch report
\n
"
)
sys
.
stderr
.
write
(
"="
*
35
)
sys
.
stderr
.
write
(
"
\n
New records :
%d
"
%
(
len
(
match_results
[
0
]),))
sys
.
stderr
.
write
(
"
\n
Matched records :
%d
"
%
(
len
(
match_results
[
1
]),))
sys
.
stderr
.
write
(
"
\n
Ambiguous records :
%d
"
%
(
len
(
match_results
[
2
]),))
sys
.
stderr
.
write
(
"
\n
Fuzzy records :
%d
\n
"
%
(
len
(
match_results
[
3
]),))
sys
.
stderr
.
write
(
"="
*
35
)
sys
.
stderr
.
write
(
"
\n
Total records :
%d
\n
"
%
(
len
(
records
),))
sys
.
stderr
.
write
(
"
\n
See detailed log at
%s
\n
"
%
(
CFG_LOGFILE
,))
if
not
noprocess
and
recs_out
:
print
'<collection xmlns="http://www.loc.gov/MARC21/slim">'
for
record
,
results
in
recs_out
:
if
textmarc_output
:
# FIXME: textmarc output does not print matching results
print
transform_record_to_marc
(
record
)
else
:
print
results
print
record_xml_output
(
record
)
print
"</collection>"
if
batch_output
:
i
=
0
outputs
=
[
'new'
,
'matched'
,
'ambiguous'
,
'fuzzy'
]
for
result
in
match_results
:
out
=
[]
out
.
append
(
'<collection xmlns="http://www.loc.gov/MARC21/slim">'
)
for
record
,
results
in
result
:
if
textmarc_output
:
# FIXME: textmarc output does not print matching results
out
.
append
(
transform_record_to_marc
(
record
))
else
:
out
.
append
(
results
)
out
.
append
(
record_xml_output
(
record
))
out
.
append
(
"</collection>"
)
filename
=
"
%s
.
%s
.xml"
%
(
batch_output
,
outputs
[
i
])
file_fd
=
open
(
filename
,
"w"
)
file_fd
.
write
(
"
\n
"
.
join
(
out
))
file_fd
.
close
()
i
+=
1
Event Timeline
Log In to Comment