Page MenuHomec4science

engine.py
No OneTemporary

File Metadata

Created
Tue, Jul 9, 08:52

engine.py

## This file is part of Invenio.
## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""BibMatch - tool to match records with database content of an Invenio instance,
either locally or remotely through invenio_connector."""
__revision__ = "$Id$"
import sys
if sys.hexversion < 0x2040000:
# pylint: disable=W0622
from sets import Set as set #for "&" intersection
# pylint: enable=W0622
import os
import getopt
import re
import getpass
from six import iteritems
from tempfile import mkstemp
from time import sleep
from invenio.config import CFG_SITE_SECURE_URL, CFG_BIBMATCH_FUZZY_WORDLIMITS, \
CFG_BIBMATCH_QUERY_TEMPLATES, \
CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT, \
CFG_BIBMATCH_LOCAL_SLEEPTIME, \
CFG_BIBMATCH_REMOTE_SLEEPTIME, \
CFG_SITE_RECORD, \
CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT
from invenio.legacy.bibmatch.config import CFG_BIBMATCH_LOGGER, \
CFG_LOGFILE
from invenio.utils.connector import InvenioConnector, \
InvenioConnectorAuthError
from invenio.legacy.bibrecord import create_records, \
record_get_field_values, record_xml_output, record_modify_controlfield, \
record_has_field, record_add_field
from invenio.legacy.bibconvert import api as bibconvert
from invenio.legacy.search_engine import get_fieldcodes
from invenio.legacy.search_engine.query_parser import SearchQueryParenthesisedParser
from invenio.legacy.dbquery import run_sql
from invenio.legacy.bibrecord.scripts.textmarc2xmlmarc import transform_file
from invenio.legacy.bibmatch.validator import validate_matches, transform_record_to_marc, \
validate_tag
from invenio.utils.text import translate_to_ascii, xml_entities_to_utf8
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
re_querystring = re.compile("\s?([^\s$]*)\[(.+?)\]([^\s$]*).*?", re.DOTALL)
def usage():
"""Print help"""
print >> sys.stderr, \
""" BibMatch - match bibliographic data against database, either locally or remotely
Usage: %s [options] [QUERY]
Options:
Output:
-0 --print-new (default) print unmatched in stdout
-1 --print-match print matched records in stdout
-2 --print-ambiguous print records that match more than 1 existing records
-3 --print-fuzzy print records that match the longest words in existing records
-b --batch-output=(filename). filename.new will be new records, filename.matched will be matched,
filename.ambiguous will be ambiguous, filename.fuzzy will be fuzzy match
-t --text-marc-output transform the output to text-marc format instead of the default MARCXML
Simple query:
-q --query-string=(search-query/predefined-query) See "Querystring"-section below.
-f --field=(field)
General options:
-n --noprocess Do not print records in stdout.
-i, --input use a named file instead of stdin for input
-v, --verbose=LEVEL verbose level (from 0 to 9, default 1)
-r, --remote=URL match against a remote Invenio installation (Full URL, no trailing '/')
Beware: Only searches public records attached to home collection
-a, --alter-recid The recid (controlfield 001) of matched or fuzzy matched records in
output will be replaced by the 001 value of the matched record.
Note: Useful if you want to replace matched records using BibUpload.
-z, --clean clean queries before searching
--no-validation do not perform post-match validation
-h, --help print this help and exit
-V, --version print version information and exit
Advanced options:
-m --mode=(a|e|o|p|r) perform an advanced search using special search mode.
Where mode is:
"a" all of the words,
"o" any of the words,
"e" exact phrase,
"p" partial phrase,
"r" regular expression.
-o --operator(a|o) used to concatenate identical fields in search query (i.e. several report-numbers)
Where operator is:
"a" boolean AND (default)
"o" boolean OR
-c --config=filename load querystrings from a config file. Each line starting with QRYSTR will
be added as a query. i.e. QRYSTR --- [title] [author]
-x --collection only perform queries in certain collection(s).
Note: matching against restricted collections requires authentication.
--user=USERNAME username to use when connecting to Invenio instance. Useful when searching
restricted collections. You will be prompted for password.
QUERYSTRINGS
Querystrings determine which type of query/strategy to use when searching for the
matching records in the database.
Predefined querystrings:
There are some predefined querystrings available:
title - standard title search. (i.e. "this is a title") (default)
title-author - title and author search (i.e. "this is a title AND Lastname, F")
reportnumber - reportnumber search (i.e. reportnumber:REP-NO-123).
You can also add your own predefined querystrings inside invenio.conf file.
You can structure your query in different ways:
* Old-style: fieldnames separated by '||' (conforms with earlier BibMatch versions):
-q "773__p||100__a"
* New-style: Invenio query syntax with "bracket syntax":
-q "773__p:\"[773__p]\" 100__a:[100__a]"
Depending on the structure of the query, it will fetch associated values from each record and put it into
the final search query. i.e in the above example it will put journal-title from 773__p.
When more then one value/datafield is found, i.e. when looking for 700__a (additional authors),
several queries will be put together to make sure all combinations of values are accounted for.
The queries are separated with given operator (-o, --operator) value.
Note: You can add more then one query to a search, just give more (-q, --query-string) arguments.
The results of all queries will be combined when matching.
BibConvert formats:
Another option to further improve your matching strategy is to use BibConvert formats. By using the formats
available by BibConvert you can change the values from the retrieved record-fields.
i.e. using WORDS(1,R) will only return the first (1) word from the right (R). This can be very useful when
adjusting your matching parameters to better match the content. For example only getting authors last-name
instead of full-name.
You can use these formats directly in the querystrings (indicated by '::'):
* Old-style: -q "100__a::WORDS(1,R)::DOWN()"
This query will take first word from the right from 100__a and also convert it to lower-case.
* New-style: -q "100__a:[100__a::WORDS(1,R)::DOWN()]"
See BibConvert documentation for a more detailed explanation of formats.
Predefined fields:
In addition to specifying distinct MARC fields in the querystrings you can use predefined
fields as configured in the LOCAL(!) Invenio system. These fields will then be mapped to one
or more fieldtags to be retrieved from input records.
Common predefined fields used in querystrings: (for Invenio demo site, your fields may vary!)
'abstract', 'affiliation', 'anyfield', 'author', 'coden', 'collaboration',
'collection', 'datecreated', 'datemodified', 'division', 'exactauthor', 'exactfirstauthor',
'experiment', 'fulltext', 'isbn', 'issn', 'journal', 'keyword', 'recid',
'reference', 'reportnumber', 'subject', 'title', 'year'
Examples:
$ bibmatch [options] < input.xml > unmatched.xml
$ bibmatch -b out -n < input.xml
$ bibmatch -a -1 < input.xml > modified_match.xml
$ bibmatch --field=title < input.xml
$ bibmatch --field=245__a --mode=a < input.xml
$ bibmatch --print-ambiguous -q title-author < input.xml > ambigmatched.xml
$ bibmatch -q "980:Thesis 773__p:\"[773__p]\" 100__a:[100__a]" -r "http://inspirebeta.net" < input.xml
$ bibmatch --collection 'Books,Articles' < input.xml
$ bibmatch --collection 'Theses' --user admin < input.xml
""" % (sys.argv[0],)
sys.exit(1)
return
class Querystring:
"""
Holds the information about a querystring.
The object contains lists of fields, formats and queries which generates search queries.
self.fields is a dict of found field-data {"tag": [list of found record data]}
self.formats is a dict of found BibConvert formats {"tag": [list of found format-values]}
self.pattern contains the original search string
self.query contains the generated query
self.operator holds the current active operator, upper-case (OR/AND)
To populate the Querystring instance with values and search string structure,
call create_query(..) with BibRecord structure and a query-string to populate with retrieved values.
Example: The template "title:[245__a]" will retrieve the value from subfield 245__a in
given record. If any BibConvert formats are specified for this field, these will
be applied.
"""
def __init__(self, operator="AND", clean=False, ascii_mode=False):
"""
Creates Querystring instance.
@param operator: operator used to concatenate several queries
@type operator: str
@param clean: indicates if queries should be sanitized
@type clean: bool
"""
self.fields = {}
self.operator = operator.upper()
self.pattern = ""
self.query = ""
self.clean = clean
self.ascii_mode = ascii_mode
self.formats = {}
def create_query(self, record, qrystr="[title]"):
"""
Main method that parses and generates a search query from
given query-string structure and record data. Returns the
resulting query-string and completeness determination as a tuple.
A query is 'complete' when all found field references has a value
in the passed record. Should a value be missing, the query is
incomplete.
@param record: bibrecord to retrive field-values from
@type record: dict
@param qrystr: proper query string template. (i.e. title:[245__a])
defaults to: [title]
@type qrystr: str
@return: (query-string, complete flag)
@rtype: tuple
"""
if qrystr == "":
qrystr = "[title]"
if "||" in qrystr or not "[" in qrystr:
# Assume old style query-strings
qrystr = self._convert_qrystr(qrystr)
# FIXME: Convert to lower case, since fuzzy_parser
# which treats everything lower-case, and will cause problems when
# retrieving data from the self.fields dict.
# Also BibConvert formats are currently case sensitive, so we cannot
# force lower-case yet.
self.pattern = qrystr.lower()
self.fields = {}
# Extract referenced field-values from given record
complete, fieldtags_found = self._extract_fieldvalues(record, qrystr)
# If no field references are found, we exit as empty query.
if len(self.fields) == 0:
self.query = ""
return self.query, False
# Now we assemble the found values into a proper search query
all_queries = []
operator_delimiter = " %s " % (self.operator,)
if self.operator == "AND":
# We gather all the values from the self.fields and put them
# in a list together with any prefix/suffix associated with the field.
new_query = self.pattern
for (field_prefix, field_reference, field_suffix), value_list in iteritems(self.fields):
new_values = []
for value in value_list:
new_values.append("%s%s%s" % (field_prefix, value, field_suffix))
new_query = new_query.replace("%s[%s]%s" % (field_prefix, field_reference, field_suffix), \
operator_delimiter.join(set(new_values)))
all_queries = [new_query]
else:
# operator is OR, which means a more elaborate approach to multi-value fields
field_tuples = []
for key, values in iteritems(self.fields):
field_list = []
for value in values:
# We add key here to be able to associate the value later
field_list.append((key, value))
field_tuples.append(field_list)
# Grab all combinations of queries
query_tuples = cproduct(field_tuples)
for query in query_tuples:
new_query = self.pattern
for (field_prefix, field_reference, field_suffix), value in query:
new_query = new_query.replace("%s[%s]%s" % (field_prefix, field_reference, field_suffix), \
"%s%s%s" % (field_prefix, value, field_suffix))
all_queries.append(new_query)
# Finally we concatenate all unique queries into one, delimited by chosen operator
self.query = operator_delimiter.join(set(all_queries))
if not complete:
# Clean away any leftover field-name references from query
for fieldtag in fieldtags_found:
self.query = self.query.replace("%s" % (fieldtag,), "")
# Clean query?
if self.clean:
self._clean_query()
return self.query, complete
def fuzzy_queries(self):
"""
Returns a list of queries that are built more 'fuzzily' using the main query as base.
The list returned also contains the current operator in context, so each query is a tuple
of (operator, query).
@return: list of tuples [(operator, query), ..]
@rtype: list [(str, str), ..]
"""
fuzzy_query_list = []
operator_delimiter = " %s " % (self.operator,)
parser = SearchQueryParenthesisedParser()
query_parts = parser.parse_query(self.pattern)
# Go through every expression in the query and generate fuzzy searches
for i in xrange(0, len(query_parts) - 1, 2):
current_operator = query_parts[i]
current_pattern = query_parts[i + 1]
fieldname_list = re_querystring.findall(current_pattern)
if fieldname_list == []:
# No reference to record value, add query 'as is'
fuzzy_query_list.append((current_operator, current_pattern))
else:
for field_prefix, field_reference, field_suffix in fieldname_list:
for value in self.fields.get((field_prefix, field_reference, field_suffix), []):
new_query = []
# Grab the x longest words in the string and perform boolean AND for each word
# x is determined by the configuration dict and is tag-based. Defaults to 3 words
word_list = get_longest_words(value, limit=CFG_BIBMATCH_FUZZY_WORDLIMITS.get(field_reference, 3))
for word in word_list:
# Create fuzzy query with key + word, including any surrounding elements like quotes, regexp etc.
new_query.append(current_pattern.replace("[%s]" % (field_reference,), word))
fuzzy_query_list.append((current_operator, operator_delimiter.join(new_query)))
# Return a list of unique queries
return list(set(fuzzy_query_list))
def _clean_query(self):
"""
This function will remove erroneous characters and combinations from
a the generated search query that might cause problems when searching.
@return: cleaned query
@rtype: str
"""
#FIXME: Extend cleaning to account for encodings and LaTeX symbols
query = self.query.replace("''", "")
query = query.replace('""', "")
return query
def _convert_qrystr(self, qrystr):
"""
Converts old-style query-strings into new-style.
"""
fields = qrystr.split("||")
converted_query = []
for field in fields:
converted_query.append("[%s]" % (field,))
return self.operator.join(converted_query)
def _extract_fieldvalues(self, record, qrystr):
"""
Extract all the values in the given record referenced in the given query-string
and attach them to self.fields as a list. Return boolean indicating if a query
is complete, and a list of all field references found.
Field references is checked to be valid MARC tag references and all values
found are added to self.fields as a list, hashed by the full reference including
prefix and suffix.
If ascii_mode is enabled, the record values will be translated to its ascii
representation.
e.g. for the query-string: 700__a:"[700__a]"
{ ('700__a:"', '700__a', '"') : ["Ellis, J.", "Olive, K. A."]}
Should no values be found for a field references, the query will be flagged
as incomplete.
@param record: bibrecord to retrive field-values from
@type record: dict
@param qrystr: proper query string template. (i.e. title:[245__a])
defaults to: [title]
@type qrystr: str
@return: complete flag, [field references found]
@rtype: tuple
"""
complete = True
fieldtags_found = []
# Find all potential references to record tag values and
# add to fields-dict as a list of values using field-name tuple as key.
#
# Each reference will be split into prefix, field-ref and suffix.
# Example:
# 773__p:"[773__p]" 100__a:/.*[100__a].*/ =>
# [('773__p:"', '773__p', '"'), ('100__a:/.*', '100__a', '.*/')]
for field_prefix, field_reference, field_suffix in re_querystring.findall(qrystr):
# First we see if there is any special formats for this field_reference
# The returned value from _extract_formats is the field-name stripped from formats.
# e.g. 245__a::SUP(NUM) => 245__a
fieldname = self._extract_formats(field_reference)
# We need everything in lower-case
field_prefix = field_prefix.lower()
field_suffix = field_suffix.lower()
# Find proper MARC tag(s) for the stripped field-name, if fieldname is used.
# e.g. author -> [100__a, 700__a]
# FIXME: Local instance only!
tag_list = get_field_tags_from_fieldname(fieldname)
if len(tag_list) == 0:
tag_list = [fieldname]
for field in tag_list:
# Check if it is really a reference to a tag to not confuse with e.g. regex syntax
tag_structure = validate_tag(field)
if tag_structure != None:
tag, ind1, ind2, code = tag_structure
value_list = record_get_field_values(record, tag, ind1, ind2, code)
if len(value_list) > 0:
# Apply any BibConvert formatting functions to each value
updated_value_list = self._apply_formats(fieldname, value_list)
# Also remove any errornous XML entities. I.e. &amp; -> &
updated_value_list = [xml_entities_to_utf8(v, skip=[]) \
for v in updated_value_list]
if self.ascii_mode:
updated_value_list = translate_to_ascii(updated_value_list)
# Store found values linked to full field reference tuple including
# (prefix, field, suffix)
self.fields[(field_prefix,
fieldname,
field_suffix)] = updated_value_list
else:
# No values found. The query is deemed incomplete
complete = False
fieldtags_found.append("%s[%s]%s" % (field_prefix, fieldname, field_suffix))
return complete, fieldtags_found
def _extract_formats(self, field_reference):
"""
Looks for BibConvert formats within query-strings and adds to
the instance. Formats are defined by one or more '::' followed
by a format keyword which is defined in BibConvert FormatField()
method.
The function also removes the references to formatting functions
in the query (self.pattern)
Returns the field_reference reference, with formats stripped.
"""
field_parts = field_reference.split("::")
if len(field_parts) > 1:
# Remove any references to BibConvert functions in pattern. e.g. 245__a::SUP(PUNCT, ) -> 245__a
# self.pattern is lower cased. Returned value is field-name stripped from formats.
for aformat in field_parts[1:]:
self.formats.setdefault(field_parts[0], []).append(aformat)
self.pattern = self.pattern.replace("[%s]" % (field_reference.lower(),), "[%s]" % (field_parts[0],))
return field_parts[0]
def _apply_formats(self, fieldname, value_list):
"""
Apply the current stored BibConvert formating operations for a
field-name to the given list of strings. The list is then returned.
@param fieldname: name of field - used as key in the formats dict
@type fieldname: string
@param value_list: list of strings to apply formats to
@type value_list: list
@return: list of values with formatting functions applied
@rtype: list
"""
if fieldname in self.formats:
new_list = []
for value in value_list:
if value.strip() != "":
# Apply BibConvert formats if applicable
for aformat in self.formats[fieldname]:
value = bibconvert.FormatField(value, aformat)
new_list.append(value)
return new_list
else:
return value_list
def get_field_tags_from_fieldname(field):
"""
Gets list of field 'field' for the record with 'sysno' system number from the database.
"""
query = "select tag.value from tag left join field_tag on tag.id=field_tag.id_tag " \
+ "left join field on field_tag.id_field=field.id where field.code='%s'" % (field,)
out = []
res = run_sql(query)
for row in res:
out.append(row[0])
return out
def cproduct(args):
"""
Returns the Cartesian product of passed arguments as a list of tuples.
'12','34' -> ('1', '3'), ('1', '4'), ('2', '3'), ('2', '4')
@param args: iterable with elements to compute
@type args: iterable
@return list containing tuples for each computed combination
@rtype list of tuples
Based on http://docs.python.org/library/itertools.html#itertools.product
"""
values = map(tuple, args)
result = [[]]
for value in values:
result = [x + [y] for x in result for y in value]
return [tuple(res) for res in result]
def bylen(word1, word2):
""" Sort comparison method that compares by length """
return len(word1) - len(word2)
def get_longest_words(wstr, limit=5):
"""
Select the longest words for matching. It selects the longest words from
the string, according to a given limit of words. By default the 5 longest word are selected
@param wstr: string to extract the longest words from
@type wstr: str
@param limit: maximum number of words extracted
@type limit: int
@return: list of long words
@rtype: list
"""
words = []
if wstr:
words = wstr.split()
words.sort(cmp=bylen)
words.reverse()
words = words[:limit]
return words
def add_recid(record, recid):
"""
Add a given record-id to the record as $$001 controlfield. If an 001 field already
exists it will be replaced.
@param record: the record to retrive field-values from
@type record: a bibrecord instance
@param recid: record-id to be added
@type recid: int
"""
if record_has_field(record, '001'):
record_modify_controlfield(record, '001', \
controlfield_value=str(recid), \
field_position_global=1)
else:
record_add_field(record, '001', controlfield_value=str(recid))
def match_result_output(bibmatch_recid, recID_list, server_url, query, matchmode="no match"):
"""
Generates result as XML comments from passed record and matching parameters.
@param bibmatch_recid: BibMatch record identifier
@type bibmatch_recid: int
@param recID_list: record matched with record
@type recID_list: list
@param server_url: url to the server the matching has been performed
@type server_url: str
@param query: matching query
@type query: str
@param matchmode: matching type
@type matchmode: str
@rtype str
@return XML result string
"""
result = ["<!-- BibMatch-Matching-Results: -->", \
"<!-- BibMatch-Matching-Record-Identifier: %s -->" % (bibmatch_recid,)]
for recID in recID_list:
result.append("<!-- BibMatch-Matching-Found: %s/%s/%s -->" \
% (server_url, CFG_SITE_RECORD, recID))
result.append("<!-- BibMatch-Matching-Mode: %s -->" \
% (matchmode,))
result.append("<!-- BibMatch-Matching-Criteria: %s -->" \
% (query,))
return "\n".join(result)
def match_records(records, qrystrs=None, search_mode=None, operator="and", \
verbose=1, server_url=CFG_SITE_SECURE_URL, modify=0, \
sleeptime=CFG_BIBMATCH_LOCAL_SLEEPTIME, \
clean=False, collections=[], user="", password="", \
fuzzy=True, validate=True, ascii_mode=False):
"""
Match passed records with existing records on a local or remote Invenio
installation. Returns which records are new (no match), which are matched,
which are ambiguous and which are fuzzy-matched. A formatted result of each
records matching are appended to each record tuple:
(record, status_code, list_of_errors, result)
@param records: records to analyze
@type records: list of records
@param qrystrs: list of tuples (field, querystring)
@type qrystrs: list
@param search_mode: if mode is given, the search will perform an advanced
query using the desired mode. Otherwise 'simple search'
is used.
@type search_mode: str
@param operator: operator used to concatenate values of fields occurring more then once.
Valid types are: AND, OR. Defaults to AND.
@type operator: str
@param verbose: be loud
@type verbose: int
@param server_url: which server to search on. Local installation by default
@type server_url: str
@param modify: output modified records of matches
@type modify: int
@param sleeptime: amount of time to wait between each query
@type sleeptime: float
@param clean: should the search queries be cleaned before passed them along?
@type clean: bool
@param collections: list of collections to search, if specified
@type collections: list
@param user: username in case of authenticated search requests
@type user: string
@param password: password in case of authenticated search requests
@type password: string
@param fuzzy: True to activate fuzzy query matching step
@type fuzzy: bool
@param validate: True to activate match validation
@type validate: bool
@param ascii_mode: True to transform values to its ascii representation
@type ascii_mode: bool
@rtype: list of lists
@return an array of arrays of records, like this [newrecs,matchedrecs,
ambiguousrecs,fuzzyrecs]
"""
newrecs = []
matchedrecs = []
ambiguousrecs = []
fuzzyrecs = []
CFG_BIBMATCH_LOGGER.info("-- BibMatch starting match of %d records --" % (len(records),))
try:
server = InvenioConnector(server_url, user=user, password=password)
except InvenioConnectorAuthError as error:
if verbose > 0:
sys.stderr.write("Authentication error when connecting to server: %s" \
% (str(error),))
CFG_BIBMATCH_LOGGER.info("-- BibMatch ending match with errors (AuthError) --")
return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]
## Go through each record and try to find matches using defined querystrings
record_counter = 0
for record in records:
record_counter += 1
if (verbose > 1):
sys.stderr.write("\n Processing record: #%d .." % (record_counter,))
# At least one (field, querystring) tuple is needed for default search query
if not qrystrs:
qrystrs = [("", "")]
CFG_BIBMATCH_LOGGER.info("Matching of record %d: Started" % (record_counter,))
[matched_results, ambiguous_results, fuzzy_results] = match_record(bibmatch_recid=record_counter,
record=record[0],
server=server,
qrystrs=qrystrs,
search_mode=search_mode,
operator=operator,
verbose=verbose,
sleeptime=sleeptime,
clean=clean,
collections=collections,
fuzzy=fuzzy,
validate=validate,
ascii_mode=ascii_mode)
## Evaluate final results for record
# Add matched record iff number found is equal to one, otherwise return fuzzy,
# ambiguous or no match
if len(matched_results) == 1:
results, query = matched_results[0]
# If one match, add it as exact match, otherwise ambiguous
if len(results) == 1:
if modify:
add_recid(record[0], results[0])
matchedrecs.append((record[0], match_result_output(record_counter, results, server_url, \
query, "exact-matched")))
if (verbose > 1):
sys.stderr.write("Final result: match - %s/record/%d\n" % (server_url, results[0]))
CFG_BIBMATCH_LOGGER.info("Matching of record %d: Completed as 'match'" % (record_counter,))
else:
ambiguousrecs.append((record[0], match_result_output(record_counter, results, server_url, \
query, "ambiguous-matched")))
if (verbose > 1):
sys.stderr.write("Final result: ambiguous\n")
CFG_BIBMATCH_LOGGER.info("Matching of record %d: Completed as 'ambiguous'" % (record_counter,))
else:
if len(fuzzy_results) > 0:
# Find common record-id for all fuzzy results and grab first query
# as "representative" query
query = fuzzy_results[0][1]
result_lists = []
for res, dummy in fuzzy_results:
result_lists.extend(res)
results = set([res for res in result_lists])
if len(results) == 1:
fuzzyrecs.append((record[0], match_result_output(record_counter, results, server_url, \
query, "fuzzy-matched")))
if (verbose > 1):
sys.stderr.write("Final result: fuzzy\n")
CFG_BIBMATCH_LOGGER.info("Matching of record %d: Completed as 'fuzzy'" % (record_counter,))
else:
ambiguousrecs.append((record[0], match_result_output(record_counter, results, server_url, \
query, "ambiguous-matched")))
if (verbose > 1):
sys.stderr.write("Final result: ambiguous\n")
CFG_BIBMATCH_LOGGER.info("Matching of record %d: Completed as 'ambiguous'" % (record_counter,))
elif len(ambiguous_results) > 0:
# Find common record-id for all ambiguous results and grab first query
# as "representative" query
query = ambiguous_results[0][1]
result_lists = []
for res, dummy in ambiguous_results:
result_lists.extend(res)
results = set([res for res in result_lists])
ambiguousrecs.append((record[0], match_result_output(record_counter, results, server_url, \
query, "ambiguous-matched")))
if (verbose > 1):
sys.stderr.write("Final result: ambiguous\n")
CFG_BIBMATCH_LOGGER.info("Matching of record %d: Completed as 'ambiguous'" % (record_counter,))
else:
newrecs.append((record[0], match_result_output(record_counter, [], server_url, str(qrystrs))))
if (verbose > 1):
sys.stderr.write("Final result: new\n")
CFG_BIBMATCH_LOGGER.info("Matching of record %d: Completed as 'new'" % (record_counter,))
CFG_BIBMATCH_LOGGER.info("-- BibMatch ending match: New(%d), Matched(%d), Ambiguous(%d), Fuzzy(%d) --" % \
(len(newrecs), len(matchedrecs), len(ambiguousrecs), len(fuzzyrecs)))
return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]
def match_record(bibmatch_recid, record, server, qrystrs=None, search_mode=None, operator="and", \
verbose=1, sleeptime=CFG_BIBMATCH_LOCAL_SLEEPTIME, \
clean=False, collections=[], fuzzy=True, validate=True, \
ascii_mode=False):
"""
Matches a single record.
@param bibmatch_recid: Current record number. Used for logging.
@type bibmatch_recid: int
@param record: record to match in BibRecord structure
@type record: dict
@param server: InvenioConnector server object
@type server: object
@param qrystrs: list of tuples (field, querystring)
@type qrystrs: list
@param search_mode: if mode is given, the search will perform an advanced
query using the desired mode. Otherwise 'simple search'
is used.
@type search_mode: str
@param operator: operator used to concatenate values of fields occurring more then once.
Valid types are: AND, OR. Defaults to AND.
@type operator: str
@param verbose: be loud
@type verbose: int
@param server_url: which server to search on. Local installation by default
@type server_url: str
@param sleeptime: amount of time to wait between each query
@type sleeptime: float
@param clean: should the search queries be cleaned before passed them along?
@type clean: bool
@param collections: list of collections to search, if specified
@type collections: list
@param fuzzy: True to activate fuzzy query matching step
@type fuzzy: bool
@param validate: True to activate match validation
@type validate: bool
@param ascii_mode: True to transform values to its ascii representation
@type ascii_mode: bool
"""
matched_results = []
ambiguous_results = []
fuzzy_results = []
# Keep a list of generated querystring objects for later use in fuzzy match
query_list = []
# Go through each querystring, trying to find a matching record
# Stops on first valid match, if no exact-match we continue with fuzzy match
for field, qrystr in qrystrs:
querystring = Querystring(operator, clean=clean, ascii_mode=ascii_mode)
query, complete = querystring.create_query(record, qrystr)
if query == "":
if (verbose > 1):
sys.stderr.write("\nEmpty query. Skipping...\n")
# Empty query, no point searching database
continue
query_list.append((querystring, complete, field))
if not complete:
if (verbose > 1):
sys.stderr.write("\nQuery not complete. Flagged as uncertain/ambiguous...\n")
# Determine proper search parameters
if search_mode != None:
search_params = dict(p1=query, f1=field, m1=search_mode, of='id', c=collections)
else:
search_params = dict(p=query, f=field, of='id', c=collections)
if (verbose > 8):
sys.stderr.write("\nSearching with values %s\n" %
(search_params,))
## Perform the search with retries
try:
result_recids = server.search_with_retry(**search_params)
except InvenioConnectorAuthError as error:
if verbose > 0:
sys.stderr.write("Authentication error when searching: %s" \
% (str(error),))
break
sleep(sleeptime)
## Check results:
if len(result_recids) > 0:
# Matches detected
if len(result_recids) > CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT:
# Too many matches, treat as non-match
if (verbose > 8):
sys.stderr.write("result=More then %d results...\n" % \
(CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT,))
continue
if (verbose > 8):
sys.stderr.write("result=%s\n" % (result_recids,))
if validate:
# Validation can be run
CFG_BIBMATCH_LOGGER.info("Matching of record %d: Query (%s) found %d records: %s" % \
(bibmatch_recid,
query,
len(result_recids),
str(result_recids)))
exact_matches, fuzzy_matches = validate_matches(bibmatch_recid=bibmatch_recid, \
record=record, \
server=server, \
result_recids=result_recids, \
collections=collections, \
verbose=verbose, \
ascii_mode=ascii_mode)
if len(exact_matches) > 0:
if (verbose > 8):
sys.stderr.write("Match validated\n")
matched_results.append((exact_matches, query))
break
elif len(fuzzy_matches) > 0:
if (verbose > 8):
sys.stderr.write("Match validated fuzzily\n")
fuzzy_results.append((fuzzy_matches, query))
continue
else:
if (verbose > 8):
sys.stderr.write("Match could not be validated\n")
else:
# No validation
# Ambiguous match
if len(result_recids) > 1:
ambiguous_results.append((result_recids, query))
if (verbose > 8):
sys.stderr.write("Ambiguous\n")
continue
# Match
elif len(result_recids) == 1:
if complete:
matched_results.append((result_recids, query))
if (verbose > 8):
sys.stderr.write("Match\n")
# This was a complete match, so let's break out to avoid more searching
break
else:
# We treat the result as ambiguous (uncertain) when query is not complete
# and we are not validating it.
ambiguous_results.append((result_recids, query))
if (verbose > 8):
sys.stderr.write("Ambiguous\n")
continue
# No match
if (verbose > 8):
sys.stderr.write("result=No matches\n")
# No complete matches, lets try fuzzy matching of all the queries
else:
if fuzzy:
if (verbose > 8):
sys.stderr.write("\nFuzzy query mode...\n")
## Fuzzy matching: Analyze all queries and perform individual searches, then intersect results.
for querystring, complete, field in query_list:
result_hitset = None
fuzzy_query_list = querystring.fuzzy_queries()
empty_results = 0
# Go through every expression in the query and generate fuzzy searches
for current_operator, qry in fuzzy_query_list:
current_resultset = None
search_params = dict(p=qry, f=field, of='id', c=collections)
try:
current_resultset = server.search_with_retry(**search_params)
except InvenioConnectorAuthError as error:
if (verbose > 0):
sys.stderr.write("Authentication error when searching: %s" \
% (str(error),))
break
if (verbose > 8):
if len(current_resultset) > CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT:
sys.stderr.write("\nSearching with values %s result=%s\n" %
(search_params, "More then %d results..." % \
(CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT,)))
else:
sys.stderr.write("\nSearching with values %s result=%s\n" %
(search_params, current_resultset))
sleep(sleeptime)
if current_resultset == None:
continue
if current_resultset == [] and empty_results < CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT:
# Allows some empty results
empty_results += 1
else:
# Intersect results with previous results depending on current operator
if result_hitset == None:
result_hitset = current_resultset
if current_operator == '+':
result_hitset = list(set(result_hitset) & set(current_resultset))
elif current_operator == '-':
result_hitset = list(set(result_hitset) - set(current_resultset))
elif current_operator == '|':
result_hitset = list(set(result_hitset) | set(current_resultset))
else:
if result_hitset and len(result_hitset) < CFG_BIBMATCH_SEARCH_RESULT_MATCH_LIMIT:
# This was a fuzzy match
query_out = " ".join(["%s %s" % (op, qu) for op, qu in fuzzy_query_list])
if validate:
# We can run validation
CFG_BIBMATCH_LOGGER.info("Matching of record %d: Query (%s) found %d records: %s" % \
(bibmatch_recid,
query_out,
len(result_hitset),
str(result_hitset)))
exact_matches, fuzzy_matches = validate_matches(bibmatch_recid=bibmatch_recid, \
record=record, \
server=server, \
result_recids=result_hitset, \
collections=collections, \
verbose=verbose, \
ascii_mode=ascii_mode)
if len(exact_matches) > 0:
if (verbose > 8):
sys.stderr.write("Match validated\n")
matched_results.append((exact_matches, query_out))
break
elif len(fuzzy_matches) > 0:
if (verbose > 8):
sys.stderr.write("Match validated fuzzily\n")
fuzzy_results.append((fuzzy_matches, query_out))
else:
if (verbose > 8):
sys.stderr.write("Match could not be validated\n")
else:
# No validation
if len(result_hitset) == 1 and complete:
fuzzy_results.append((result_hitset, query_out))
if (verbose > 8):
sys.stderr.write("Fuzzy: %s\n" % (result_hitset,))
else:
# We treat the result as ambiguous (uncertain) when:
# - query is not complete
# - more then one result
ambiguous_results.append((result_hitset, query_out))
if (verbose > 8):
sys.stderr.write("Ambiguous\n")
return [matched_results, ambiguous_results, fuzzy_results]
def transform_input_to_marcxml(filename=None, file_input=""):
"""
Takes the filename or input of text-marc and transforms it
to MARCXML.
"""
if not filename:
# Create temporary file to read from
tmp_fd, filename = mkstemp()
os.write(tmp_fd, file_input)
os.close(tmp_fd)
try:
# Redirect output, transform, restore old references
old_stdout = sys.stdout
new_stdout = StringIO()
sys.stdout = new_stdout
transform_file(filename)
finally:
sys.stdout = old_stdout
return new_stdout.getvalue()
def bibrecs_has_errors(bibrecs):
"""
Utility function to check a list of parsed BibRec objects, directly
from the output of bibrecord.create_records(), for any
badly parsed records.
If an error-code is present in the result the function will return True,
otherwise False.
"""
return 0 in [err_code for dummy, err_code, dummy2 in bibrecs]
def main():
"""
Record matches database content when defined search gives
exactly one record in the result set. By default the match is
done on the title field.
"""
try:
opts, args = getopt.getopt(sys.argv[1:], "0123hVm:fq:c:nv:o:b:i:r:tazx:",
[
"print-new",
"print-match",
"print-ambiguous",
"print-fuzzy",
"help",
"version",
"mode=",
"field=",
"query-string=",
"config=",
"no-process",
"verbose=",
"operator=",
"batch-output=",
"input=",
"remote=",
"text-marc-output",
"alter-recid",
"clean",
"collection=",
"user=",
"no-fuzzy",
"no-validation",
"ascii"
])
except getopt.GetoptError as e:
usage()
match_results = []
qrystrs = [] # list of query strings
print_mode = 0 # default match mode to print new records
noprocess = 0 # dump result in stdout?
operator = "and"
verbose = 1 # 0..be quiet
records = []
batch_output = "" # print stuff in files
f_input = "" # read from where, if param "i"
server_url = CFG_SITE_SECURE_URL # url to server performing search, local by default
modify = 0 # alter output with matched record identifiers
textmarc_output = 0 # output in MARC instead of MARCXML
field = ""
search_mode = None # activates a mode, uses advanced search instead of simple
sleeptime = CFG_BIBMATCH_LOCAL_SLEEPTIME # the amount of time to sleep between queries, changes on remote queries
clean = False # should queries be sanitized?
collections = [] # only search certain collections?
user = ""
password = ""
validate = True # should matches be validate?
fuzzy = True # Activate fuzzy-mode if no matches found for a record
ascii_mode = False # Should values be turned into ascii mode
for opt, opt_value in opts:
if opt in ["-0", "--print-new"]:
print_mode = 0
if opt in ["-1", "--print-match"]:
print_mode = 1
if opt in ["-2", "--print-ambiguous"]:
print_mode = 2
if opt in ["-3", "--print-fuzzy"]:
print_mode = 3
if opt in ["-n", "--no-process"]:
noprocess = 1
if opt in ["-h", "--help"]:
usage()
sys.exit(0)
if opt in ["-V", "--version"]:
print __revision__
sys.exit(0)
if opt in ["-t", "--text-marc-output"]:
textmarc_output = 1
if opt in ["-v", "--verbose"]:
verbose = int(opt_value)
if opt in ["-f", "--field"]:
if opt_value in get_fieldcodes():
field = opt_value
if opt in ["-q", "--query-string"]:
try:
template = CFG_BIBMATCH_QUERY_TEMPLATES[opt_value]
qrystrs.append((field, template))
except KeyError:
qrystrs.append((field, opt_value))
if opt in ["-m", "--mode"]:
search_mode = opt_value
if opt in ["-o", "--operator"]:
if opt_value.lower() in ["o", "or", "|"]:
operator = "or"
elif opt_value.lower() in ["a", "and", "&"]:
operator = "and"
if opt in ["-b", "--batch-output"]:
batch_output = opt_value
if opt in ["-i", "--input"]:
f_input = opt_value
if opt in ["-r", "--remote"]:
server_url = opt_value
sleeptime = CFG_BIBMATCH_REMOTE_SLEEPTIME
if opt in ["-a", "--alter-recid"]:
modify = 1
if opt in ["-z", "--clean"]:
clean = True
if opt in ["-c", "--config"]:
config_file = opt_value
config_file_read = bibconvert.read_file(config_file, 0)
for line in config_file_read:
tmp = line.split("---")
if(tmp[0] == "QRYSTR"):
qrystrs.append((field, tmp[1]))
if opt in ["-x", "--collection"]:
colls = opt_value.split(',')
for collection in colls:
if collection not in collections:
collections.append(collection)
if opt in ["--user"]:
user = opt_value
password = getpass.getpass()
if opt == "--no-fuzzy":
fuzzy = False
if opt == "--no-validation":
validate = False
if opt == "--ascii":
ascii_mode = True
if verbose:
sys.stderr.write("\nBibMatch: Parsing input file %s..." % (f_input,))
read_list = []
if not f_input:
for line_in in sys.stdin:
read_list.append(line_in)
else:
f = open(f_input)
for line_in in f:
read_list.append(line_in)
f.close()
file_read = "".join(read_list)
# Detect input type
if not file_read.strip().startswith('<'):
# Not xml, assume type textmarc
file_read = transform_input_to_marcxml(f_input, file_read)
records = create_records(file_read)
if len(records) == 0:
if verbose:
sys.stderr.write("\nBibMatch: Input file contains no records.\n")
sys.exit(1)
# Check for any parsing errors in records
if bibrecs_has_errors(records):
# Errors found. Let's try to remove any XML entities
if verbose > 8:
sys.stderr.write("\nBibMatch: Parsing error. Trying removal of XML entities..\n")
file_read = xml_entities_to_utf8(file_read)
records = create_records(file_read)
if bibrecs_has_errors(records):
# Still problems.. alert the user and exit
if verbose:
errors = "\n".join([str(err_msg) for dummy, err_code, err_msg in records \
if err_code == 0])
sys.stderr.write("\nBibMatch: Errors during record parsing:\n%s\n" % \
(errors,))
sys.exit(1)
if verbose:
sys.stderr.write("read %d records" % (len(records),))
sys.stderr.write("\nBibMatch: Matching ...")
if not validate:
if verbose:
sys.stderr.write("\nWARNING: Skipping match validation.\n")
match_results = match_records(records=records,
qrystrs=qrystrs,
search_mode=search_mode,
operator=operator,
verbose=verbose,
server_url=server_url,
modify=modify,
sleeptime=sleeptime,
clean=clean,
collections=collections,
user=user,
password=password,
fuzzy=fuzzy,
validate=validate,
ascii_mode=ascii_mode)
# set the output according to print..
# 0-newrecs 1-matchedrecs 2-ambiguousrecs 3-fuzzyrecs
recs_out = match_results[print_mode]
if verbose:
sys.stderr.write("\n\n Bibmatch report\n")
sys.stderr.write("=" * 35)
sys.stderr.write("\n New records : %d" % (len(match_results[0]),))
sys.stderr.write("\n Matched records : %d" % (len(match_results[1]),))
sys.stderr.write("\n Ambiguous records : %d" % (len(match_results[2]),))
sys.stderr.write("\n Fuzzy records : %d\n" % (len(match_results[3]),))
sys.stderr.write("=" * 35)
sys.stderr.write("\n Total records : %d\n" % (len(records),))
sys.stderr.write("\n See detailed log at %s\n" % (CFG_LOGFILE,))
if not noprocess and recs_out:
print '<collection xmlns="http://www.loc.gov/MARC21/slim">'
for record, results in recs_out:
if textmarc_output:
# FIXME: textmarc output does not print matching results
print transform_record_to_marc(record)
else:
print results
print record_xml_output(record)
print "</collection>"
if batch_output:
i = 0
outputs = ['new', 'matched', 'ambiguous', 'fuzzy']
for result in match_results:
out = []
out.append('<collection xmlns="http://www.loc.gov/MARC21/slim">')
for record, results in result:
if textmarc_output:
# FIXME: textmarc output does not print matching results
out.append(transform_record_to_marc(record))
else:
out.append(results)
out.append(record_xml_output(record))
out.append("</collection>")
filename = "%s.%s.xml" % (batch_output, outputs[i])
file_fd = open(filename, "w")
file_fd.write("\n".join(out))
file_fd.close()
i += 1

Event Timeline