Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F110911634
bibmatch_validator.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Apr 28, 18:45
Size
36 KB
Mime Type
text/x-python
Expires
Wed, Apr 30, 18:45 (2 d)
Engine
blob
Format
Raw Data
Handle
25784506
Attached To
R3600 invenio-infoscience
bibmatch_validator.py
View Options
## This file is part of Invenio.
## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BibMatch - tool to match records with database content of
an Invenio instance, either locally or remotely.
bibmatch_validator - module containing functions for match validation step
"""
# Module revision marker. NOTE(review): "$Id$" looks like a version-control
# keyword placeholder that is expanded on checkout -- confirm before relying
# on its value.
__revision__ = "$Id$"
import
re
import
sys
import
pprint
import
difflib
from
invenio.config
import
CFG_BIBMATCH_MATCH_VALIDATION_RULESETS
,
\
CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT
from
invenio.bibmatch_config
import
CFG_BIBMATCH_VALIDATION_MATCHING_MODES
,
\
CFG_BIBMATCH_VALIDATION_RESULT_MODES
,
\
CFG_BIBMATCH_VALIDATION_COMPARISON_MODES
,
\
CFG_BIBMATCH_LOGGER
from
invenio.bibrecord
import
create_records
,
record_get_field_values
from
invenio.xmlmarc2textmarc
import
get_sysno_from_record
,
create_marc_record
from
invenio.bibauthorid_name_utils
import
compare_names
from
invenio.bibauthorid_name_utils
import
string_partition
from
invenio.textutils
import
translate_to_ascii
# Pre-compiled pattern used by validate_tag() to accept a MARC field
# specification: exactly three digits, optionally followed by up to three
# characters taken from letters, digits, "_" and "%".
re_valid_tag = re.compile("^[0-9]{3}[a-zA-Z0-9_%]{0,3}$")
def validate_matches(bibmatch_recid, record, server, result_recids,
                     collections="", verbose=0, ascii_mode=False):
    """
    Perform record validation on a set of matches. This function will
    try to find any search-result that "really" is a correct match, based on
    various methods defined in a given rule-set. See more about rule-sets in
    validate_match() function documentation.

    Every candidate is fetched from the server as MARCXML, converted to a
    BibRecord structure and compared against the original record. If any
    candidate cannot be fetched or parsed, validation is aborted and two
    empty lists are returned.

    @param bibmatch_recid: Current record number. Used for logging.
    @type bibmatch_recid: int

    @param record: bibrec structure of original record
    @type record: dict

    @param server: InvenioConnector object to matched record source repository
    @type server: InvenioConnector object

    @param result_recids: the list of record ids from search result.
    @type result_recids: list

    @param collections: collections to search, if specified. Passed as the
                        "c" search parameter when non-empty.
    @type collections: list

    @param verbose: be loud (debug output goes to stderr when above 8)
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: tuple (exact_matches, fuzzy_matches). The first list holds the
             record IDs whose match ratio is exactly 1.0, the second the
             record IDs whose ratio is lower but at least
             CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT. Records that do not
             match at all are left out of both lists.
    @rtype: tuple
    """
    matches_found = []
    fuzzy_matches_found = []

    # Generate final rule-set by analyzing the record
    final_ruleset = get_validation_ruleset(record)
    if not final_ruleset:
        # Adjacent literals are used instead of a backslash continuation inside
        # the string, so source indentation never leaks into the message.
        sys.stderr.write("Bad configuration rule-set. "
                         "Please check that CFG_BIBMATCH_MATCH_VALIDATION_RULESETS "
                         "is formed correctly.\n")
        return [], []

    if verbose > 8:
        sys.stderr.write("\nStart record validation:\n\nFinal validation ruleset used:\n")
        pp = pprint.PrettyPrinter(stream=sys.stderr, indent=2)
        pp.pprint(final_ruleset)

    # Fetch all records in MARCXML and convert to BibRec
    found_record_list = []
    for recid in result_recids:
        search_params = {'p': "001:%d" % (recid,), 'of': "xm"}
        if collections:
            search_params['c'] = collections
        result_marcxml = server.search_with_retry(**search_params)
        result_record_list = create_records(result_marcxml)

        # Check if record was found and BibRecord generation was successful.
        # Exactly one result is expected. The BibRecord structure used below is
        # the first element of that result, so it is checked as well
        # (presumably it is None when the MARCXML could not be parsed --
        # verify against bibrecord.create_records).
        if len(result_record_list) != 1 or \
           result_record_list[0] is None or \
           result_record_list[0][0] is None:
            # Error fetching a record. Unable to validate and returning with empty lists.
            if verbose > 8:
                sys.stderr.write("\nError retrieving MARCXML for matched record %s\n"
                                 % (str(recid),))
            return [], []

        # Add a tuple of record ID (for easy look-up later) and BibRecord structure
        found_record_list.append((recid, result_record_list[0][0]))

    # Validate records one-by-one, adding any matches to the list of matching record IDs
    for position, (recid, matched_record) in enumerate(found_record_list):
        if verbose > 8:
            # position is zero-based; the debug output counts from 1
            sys.stderr.write("\n Validating matched record #%d (%s):\n"
                             % (position + 1, recid))
        CFG_BIBMATCH_LOGGER.info("Matching of record %d: Comparing to matched record %s"
                                 % (bibmatch_recid, recid))
        match_ratio = validate_match(record, matched_record, final_ruleset,
                                     verbose, ascii_mode)
        if match_ratio == 1.0:
            # All matches were a success, this is an exact match
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Exact match found -> %s"
                                     % (bibmatch_recid, recid))
            matches_found.append(recid)
        elif match_ratio >= CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT:
            # This means that some matches failed, but some succeeded as well. That's fuzzy...
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Fuzzy match found -> %s"
                                     % (bibmatch_recid, recid))
            fuzzy_matches_found.append(recid)
        else:
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Not a match"
                                     % (bibmatch_recid,))

    # Return lists of exactly and fuzzily matching record IDs
    return matches_found, fuzzy_matches_found
def validate_match(org_record, matched_record, ruleset, verbose=0, ascii_mode=False):
    """
    This function will try to match the original record with matched record.
    This comparison uses various methods defined in configuration and/or
    determined from the source record.

    These methods can be derived from each rule-set defined, which contains a
    mapping of a certain pattern to a list of rules defining the "match-strategy".

    For example:

    ('260__', [{ 'tags' : '260__c',
                 'threshold' : 0.8,
                 'compare_mode' : 'lazy',
                 'match_mode' : 'date',
                 'result_mode' : 'normal' }])

    Quick run-down of possible values:
      Compare mode:
        'strict'    : all (sub-)fields are compared, and all must match. Order is significant.
        'normal'    : all (sub-)fields are compared, and all must match. Order is ignored.
        'lazy'      : all (sub-)fields are compared with each other and at least one must match
        'ignored'   : the tag is ignored in the match. Used to disable previously defined rules.

      Match mode:
        'title'     : uses a method specialized for comparing titles, e.g. looking for subtitles
        'author'    : uses a special authorname comparison. Will take initials into account.
        'identifier': special matching for identifiers, stripping away punctuation
        'date'      : matches dates by extracting and comparing the year
        'normal'    : normal string comparison.

      Result mode:
        'normal'    : a failed match will cause the validation to continue on other rules (if any)
                      a successful match will cause the validation to continue on other rules (if any)
        'final'     : a failed match will cause the validation to immediately exit as a failure.
                      a successful match will cause validation to immediately exit as a success.
        'joker'     : a failed match will cause the validation to continue on other rules (if any).
                      a successful match will cause validation to immediately exit as a success.

    Fields are considered matching when all its subfields or values match. ALL matching strategy
    must return successfully for a match to be validated (except for 'joker' mode).

    Rules for which either record has no values are skipped and do not count
    as a comparison.

    @param org_record: bibrec structure of original record
    @type org_record: dict

    @param matched_record: bibrec structure of matched record
    @type matched_record: dict

    @param ruleset: ordered rules to apply, each one a tuple
                    (tags, threshold, compare_mode, match_mode, result_mode)
                    where tags is a comma-separated string of MARC tags
    @type ruleset: list

    @param verbose: be loud (debug output goes to stderr when above 8)
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: Number of matches succeeded divided by number of comparisons done. At least two
        successful matches must be done unless a joker or final match is found
        (1.0 is returned right away in that case); otherwise 0.0 is returned.
    @rtype: float
    """
    total_number_of_matches = 0
    total_number_of_comparisons = 0

    for field_tags, threshold, compare_mode, match_mode, result_mode in ruleset:
        field_tag_list = field_tags.split(',')
        if verbose > 8:
            # Adjacent literals instead of a backslash continuation inside the
            # string, so the message does not depend on source indentation.
            sys.stderr.write("\nValidating tags: %s in parsing mode '%s' and comparison"
                             " mode '%s' as '%s' result with threshold %0.2f\n"
                             % (field_tag_list, compare_mode, match_mode,
                                result_mode, threshold))

        ## 1. COMPARE MODE
        # Fetch defined fields from both records
        original_record_values = []
        matched_record_values = []
        for field_tag in field_tag_list:
            tag_structure = validate_tag(field_tag)
            if tag_structure is None:
                # Not a valid tag specification: contributes no values
                continue
            tag, ind1, ind2, code = tag_structure
            # Fetch all field instances to match
            original_record_values.extend(
                record_get_field_values(org_record, tag, ind1, ind2, code))
            matched_record_values.extend(
                record_get_field_values(matched_record, tag, ind1, ind2, code))

        if not original_record_values or not matched_record_values:
            # One or both records do not have values, ignore this rule.
            if verbose > 8:
                sys.stderr.write("\nAt least one record does not have this field. Continue.\n")
            continue

        if ascii_mode:
            original_record_values = translate_to_ascii(original_record_values)
            matched_record_values = translate_to_ascii(matched_record_values)

        # How many field-value matches are needed for successful validation of this rule
        ignore_order = True
        matches_needed = 0
        if compare_mode == 'lazy':
            # 'lazy' : all fields are matched with each other, if any match = success
            matches_needed = 1
        elif compare_mode == 'normal':
            # 'normal' : all fields are compared, and all must match.
            # Order is ignored. The number of matches needed is equal
            # to the value count of original record
            matches_needed = len(original_record_values)
        elif compare_mode == 'strict':
            # 'strict' : all fields are compared, and all must match. Order matters.
            if len(original_record_values) != len(matched_record_values):
                # Not the same number of fields, not a valid match
                # Unless this is a joker, we return indicating failure
                if result_mode != 'joker':
                    return 0.0
                continue
            matches_needed = len(original_record_values)
            ignore_order = False

        if verbose > 8:
            sys.stderr.write("Total matches needed: %d -> " % (matches_needed,))

        ## 2. MATCH MODE
        total_number_of_comparisons += 1
        # Specialised comparison per match mode; anything else falls back to
        # the normal string comparison. The table is built here (not at module
        # level) so the functions are only looked up when a rule is evaluated.
        comparison_function = {
            'title': compare_fieldvalues_title,
            'author': compare_fieldvalues_authorname,
            'identifier': compare_fieldvalues_identifier,
            'date': compare_fieldvalues_date,
        }.get(match_mode, compare_fieldvalues_normal)

        # Get list of comparisons to perform containing extracted values
        field_comparisons = get_paired_comparisons(original_record_values,
                                                   matched_record_values,
                                                   ignore_order)

        if verbose > 8:
            sys.stderr.write("Field comparison values:\n%s\n" % (field_comparisons,))

        # Run comparisons according to match_mode
        current_matching_status, matches = comparison_function(field_comparisons,
                                                               threshold,
                                                               matches_needed)
        CFG_BIBMATCH_LOGGER.info("-- Comparing fields %s with %s = %d matches of %d"
                                 % (str(original_record_values),
                                    str(matched_record_values),
                                    matches, matches_needed))

        ## 3. RESULT MODE
        if current_matching_status:
            if verbose > 8:
                sys.stderr.write("Fields matched successfully.\n")
            if result_mode in ['final', 'joker']:
                # Matching success. A successful final or joker rule means an
                # exact match right away.
                return 1.0
            total_number_of_matches += 1
        else:
            # Matching failed. Not a valid match
            if result_mode == 'final':
                # Final does not allow failure
                return 0.0
            elif result_mode == 'joker':
                # A failed joker still counts as a match
                total_number_of_matches += 1
                if verbose > 8:
                    sys.stderr.write("Fields not matching. (Joker)\n")
            else:
                if verbose > 8:
                    sys.stderr.write("Fields not matching.\n")

    # At least two successful rules are required to report any match at all
    if total_number_of_matches < 2 or total_number_of_comparisons == 0:
        return 0.0
    return total_number_of_matches / float(total_number_of_comparisons)
def transform_record_to_marc(record, options=None):
    """
    Transform a given bibrec record into MARC using get_sysno_from_record()
    and create_marc_record() from invenio.xmlmarc2textmarc. The function
    returns the record as a MARC string.

    @param record: bibrec structure for record to transform
    @type record: dict

    @param options: dictionary describing type of MARC record. When omitted
                    (None), defaults to textmarc:
                    {'text-marc': 1, 'aleph-marc': 0}.
    @type options: dict

    @return: resulting MARC record as string
    @rtype: string
    """
    if options is None:
        # Build the default per call: a dict literal in the signature would be
        # one shared object that any callee could modify for all later calls.
        options = {'text-marc': 1, 'aleph-marc': 0}
    sysno = get_sysno_from_record(record, options)
    # Note: Record dict is (shallow) copied as create_marc_record() perform deletions
    return create_marc_record(record.copy(), sysno, options)
def compare_fieldvalues_normal(field_comparisons, threshold, matches_needed):
    """
    Validate fields with a plain normalized string similarity
    (difflib.SequenceMatcher ratio).

    Both values of every pair are lower-cased and stripped of leading and
    trailing whitespace before being compared. A field counts as matched as
    soon as one of its value pairs reaches the threshold; at most one match
    is counted per field. Evaluation stops as soon as the needed number of
    matched fields is reached.

    @param field_comparisons: list of comparisons, each which contains a list
        of field-value to field-value comparisons.
    @type field_comparisons: list

    @param threshold: similarity a value pair must reach (equal or above) to
        count as a positive match.
    @type threshold: float

    @param matches_needed: number of positive field matches needed for the entire
        comparison process to give a positive result.
    @type matches_needed: int

    @return: tuple of matching result, True if enough matches are found, False if not,
        and number of matches.
    @rtype: tuple
    """
    def _any_pair_similar(candidate_pairs):
        # True as soon as one value pair of this field reaches the threshold
        for left, right in candidate_pairs:
            matcher = difflib.SequenceMatcher(None,
                                              left.lower().strip(),
                                              right.lower().strip())
            if matcher.ratio() >= threshold:
                return True
        return False

    hits = 0
    for candidate_pairs in field_comparisons:
        if _any_pair_similar(candidate_pairs):
            hits += 1
        if hits >= matches_needed:
            # Enough fields matched already, the remaining ones are not needed
            return True, hits
    return hits >= matches_needed, hits
def compare_fieldvalues_authorname(field_comparisons, threshold, matches_needed):
    """
    Validate fields holding author names, using compare_names() from
    BibAuthorId so that initials vs. full names are taken into account.

    For every value pair, the original name is compared against the other
    name both as given and with the parts around the comma swapped. A field
    counts as matched as soon as one such comparison reaches the threshold;
    at most one match is counted per field. Evaluation stops as soon as the
    needed number of matched fields is reached.

    When not enough fields matched, the result is still positive if the
    share of matched fields (matches / matches_needed) is strictly above the
    same threshold.

    @param field_comparisons: list of comparisons, each which contains a list
        of field-value to field-value comparisons.
    @type field_comparisons: list

    @param threshold: score a name comparison must reach (equal or above) to
        count as a positive match.
    @type threshold: float

    @param matches_needed: number of positive field matches needed for the entire
        comparison process to give a positive result.
    @type matches_needed: int

    @return: tuple of matching result, True if enough matches are found, False if not,
        and number of matches.
    @rtype: tuple
    """
    def _any_name_matches(candidate_pairs):
        # True as soon as one name pair of this field reaches the threshold
        for name, other_name in candidate_pairs:
            # Only the first group is used: the name as given, paired with
            # both orderings of the other name, e.g. for Doe,J vs. Smith,J:
            # (('Doe,J', 'Smith,J'), ('Doe,J', 'J,Smith'))
            name_variants = get_paired_comparisons(
                get_reversed_string_variants(name),
                get_reversed_string_variants(other_name))[0]
            for left, right in name_variants:
                # Author-name comparison - using BibAuthorid function
                if compare_names(left, right) >= threshold:
                    return True
        return False

    hits = 0
    for candidate_pairs in field_comparisons:
        if _any_name_matches(candidate_pairs):
            hits += 1
        if hits >= matches_needed:
            # Enough fields matched already, the remaining ones are not needed
            return True, hits

    if hits >= matches_needed:
        return True, hits
    # Often authors are not matching fully, so the number of matches is
    # allowed to be a little lower, judged against the same threshold
    return hits / float(matches_needed) > threshold, hits
def compare_fieldvalues_identifier(field_comparisons, threshold, matches_needed):
    """
    Performs field validation given a list of field comparisons using a method to
    normalize identifiers for comparisons. For example by removing hyphens and other
    symbols.

    Before the values are compared they are converted to lower-case, runs of
    non-word characters are removed, and a single zero directly following a
    non-digit is dropped, e.g. 'DESY-F35D-97-04' -> 'desyf35d974'. Letters
    are never removed, so 'A01' and 'B01' stay distinct ('a1' vs. 'b1').

    Each comparison is done according to given threshold which the normalized
    result must be equal or above to match.

    During validation the fields are compared and matches are counted per
    field, up to the given amount of matches needed is met, causing the
    function to return True. If validation ends before this threshold is met
    it will return False.

    @param field_comparisons: list of comparisons, each which contains a list
        of field-value to field-value comparisons.
    @type field_comparisons: list

    @param threshold: similarity a value pair must reach (equal or above) to
        count as a positive match.
    @type threshold: float

    @param matches_needed: number of positive field matches needed for the entire
        comparison process to give a positive result.
    @type matches_needed: int

    @return: tuple of matching result, True if enough matches are found, False if not,
        and number of matches.
    @rtype: tuple
    """
    # Three alternatives, tried in this order at every position:
    #   [\W_]0           a symbol (or underscore) together with the zero after it
    #   (?<=[^\W\d_])0   a zero right after a letter; the letter itself is kept
    #   \W+              any other run of non-word characters
    # The previous pattern ('\D[0]|\W+') also swallowed the letter in front of
    # a zero, which made distinct identifiers such as 'A01' and 'B01' collapse
    # to the same string.
    strip_noise = re.compile(r'[\W_]0|(?<=[^\W\d_])0|\W+').sub

    def _any_identifier_matches(candidate_pairs):
        # True as soon as one identifier pair of this field reaches the threshold
        for value, other_value in candidate_pairs:
            normalized = strip_noise("", value.lower())
            other_normalized = strip_noise("", other_value.lower())
            ratio = difflib.SequenceMatcher(None, normalized, other_normalized).ratio()
            if ratio >= threshold:
                return True
        return False

    matches_found = 0
    # Loop over all possible comparisons field by field; at most one match is
    # counted per field.
    for comparisons in field_comparisons:
        if _any_identifier_matches(comparisons):
            matches_found += 1
        # If we already have found required number of matches, we return immediately
        if matches_found >= matches_needed:
            return True, matches_found
    return matches_found >= matches_needed, matches_found
def compare_fieldvalues_title(field_comparisons, threshold, matches_needed):
    """
    Performs field validation given a list of field comparisons using a method
    specialized for comparing titles, by looking for possible concatenated
    title and subtitles.

    For every value pair the first colon-separated variant of the original
    value is compared with every variant of the other value (see
    get_separated_string_variants()). Each comparison is done according to
    given threshold which the normalized result must be equal or above to match.

    Before the values are compared they will be massaged by putting all values
    lower-case and any leading/trailing spaces are removed.

    During validation the fields are compared and matches are counted per
    field, up to the given amount of matches needed is met, causing the
    function to return True. If validation ends before this threshold is met
    it will return False.

    @param field_comparisons: list of comparisons, each which contains a list
        of field-value to field-value comparisons.
    @type field_comparisons: list

    @param threshold: similarity a value pair must reach (equal or above) to
        count as a positive match.
    @type threshold: float

    @param matches_needed: number of positive field matches needed for the entire
        comparison process to give a positive result.
    @type matches_needed: int

    @return: tuple of matching result, True if enough matches are found, False if not,
        and number of matches.
    @rtype: tuple
    """
    def _any_title_matches(candidate_pairs):
        # True as soon as one title pair of this field reaches the threshold
        for value, other_value in candidate_pairs:
            # TODO: KB of alias mappings of common names
            grouped_variants = _get_grouped_pairs(
                get_separated_string_variants(value),
                get_separated_string_variants(other_value))
            if not grouped_variants:
                # An empty original value yields no variants at all; indexing
                # the first group would raise IndexError. Nothing to compare.
                continue
            # NOTE(review): only the first group is used, i.e. the first
            # variant of the original value against all variants of the other
            # value. Kept as-is; confirm whether all groups were intended.
            for str1, str2 in grouped_variants[0]:
                # Title comparison
                ratio = difflib.SequenceMatcher(None,
                                                str1.lower().strip(),
                                                str2.lower().strip()).ratio()
                if ratio >= threshold:
                    return True
        return False

    matches_found = 0
    # Loop over all possible comparisons field by field; at most one match is
    # counted per field.
    for comparisons in field_comparisons:
        if _any_title_matches(comparisons):
            matches_found += 1
        # If we already have found required number of matches, we return immediately
        if matches_found >= matches_needed:
            return True, matches_found
    return matches_found >= matches_needed, matches_found
def compare_fieldvalues_date(field_comparisons, threshold, matches_needed):
    """
    Validate fields holding dates by comparing the years they contain.

    Every group of four consecutive digits is extracted from both values of
    a pair, and each year of the first value is scored against each year of
    the second one with compare_numbers(). A field counts as matched as soon
    as one score reaches the threshold; at most one match is counted per
    field. Evaluation stops as soon as the needed number of matched fields
    is reached.

    @param field_comparisons: list of comparisons, each which contains a list
        of field-value to field-value comparisons.
    @type field_comparisons: list

    @param threshold: score a year comparison must reach (equal or above) to
        count as a positive match.
    @type threshold: float

    @param matches_needed: number of positive field matches needed for the entire
        comparison process to give a positive result.
    @type matches_needed: int

    @return: tuple of matching result, True if enough matches are found, False if not,
        and number of matches.
    @rtype: tuple
    """
    def _any_year_matches(candidate_pairs):
        # True as soon as one pair of years of this field reaches the threshold
        for left, right in candidate_pairs:
            left_years = re.findall('[0-9]{4}', left.lower())
            right_years = re.findall('[0-9]{4}', right.lower())
            for left_year in left_years:
                for right_year in right_years:
                    # Years are compared numerically
                    score = compare_numbers(int(left_year), int(right_year))
                    if score >= threshold:
                        return True
        return False

    hits = 0
    for candidate_pairs in field_comparisons:
        if _any_year_matches(candidate_pairs):
            hits += 1
        if hits >= matches_needed:
            # Enough fields matched already, the remaining ones are not needed
            return True, hits
    return hits >= matches_needed, hits
def get_validation_ruleset(record):
    """
    This function will iterate over any defined rule-sets in
    CFG_BIBMATCH_MATCH_VALIDATION_RULESETS, generating a validation
    rule-set for use when comparing records.

    The rule-sets are applied in the order of appearance, meaning that the
    last rules will have precedence over earlier ones, should MARC tags be
    conflicting.

    You can add your own rule-sets in invenio.conf. The 'default' rule-set
    is always applied, but the tag-rules can be overwritten by other
    rule-sets. The rule-sets are only allowed to be tuples of two items.

    For example: ('980__ \\$\\$aTHESIS', [{ 'tags' : ..., ... }])

    * The first part is a string containing a regular expression
      that is matched against the textmarc representation of each
      record. If a match is found, the final rule-set is updated with
      the given "sub rule-set", i.e. second item.

    * The second item is a list of rules, each one a dict with the keys
      'tags', 'threshold', 'compare_mode', 'match_mode' and 'result_mode'.

    Rules whose compare mode is 'ignored' or whose threshold is not positive
    are left out of the result.

    @param record: bibrec record dict to analyze
    @type record: dict

    @return: list of rules (tags, threshold, compare_mode, match_mode,
             result_mode), with 'final' rules first, then 'joker' rules,
             then all others. None is returned when a rule lacks one of the
             required keys or uses an unknown mode.
    @rtype: list
    """
    # Convert original record to textmarc in order to regexp search
    original_record_marc = transform_record_to_marc(record)

    # Lets parse the rule-set configuration to try to match rule-sets
    # with original record, adding to/overwriting as we go
    validation_ruleset = {}
    for pattern, rules in CFG_BIBMATCH_MATCH_VALIDATION_RULESETS:
        if pattern != "default" and \
           re.search(pattern, original_record_marc) is None:
            # This rule-set does not apply to the record
            continue
        for rule in rules:
            try:
                # All keys are read inside the try block, so that a missing
                # key is reported as a bad rule-set (None) instead of
                # escaping as a KeyError.
                tags = rule['tags']
                threshold = rule['threshold']
                compare_mode = rule['compare_mode']
                match_mode = rule['match_mode']
                result_mode = rule['result_mode']
            except KeyError:
                # Bad rule-set, return None
                return None

            # Simple validation of rules syntax
            if compare_mode not in CFG_BIBMATCH_VALIDATION_COMPARISON_MODES:
                return None
            if match_mode not in CFG_BIBMATCH_VALIDATION_MATCHING_MODES:
                return None
            if result_mode not in CFG_BIBMATCH_VALIDATION_RESULT_MODES:
                return None

            # Update/Add rule in rule-set
            validation_ruleset[tags] = (threshold, compare_mode,
                                        match_mode, result_mode)

    # Now generate the final list of rules in proper order, so final and joker result-modes
    # are executed before normal rules. Order of precedence: final, joker, normal
    final_list = []
    joker_list = []
    normal_list = []
    # items() rather than the Python 2 only iteritems(): works on every
    # interpreter version and the dict is small.
    for tag, (threshold, compare_mode, match_mode, result_mode) \
            in validation_ruleset.items():
        if compare_mode == 'ignored' or threshold <= 0.0:
            # Ignore rule
            continue
        entry = (tag, threshold, compare_mode, match_mode, result_mode)
        if result_mode == 'final':
            final_list.append(entry)
        elif result_mode == 'joker':
            joker_list.append(entry)
        else:
            normal_list.append(entry)
    return final_list + joker_list + normal_list
def validate_tag(field_tag):
    """
    Split a MARC field specification into its parts.

    The string is checked against re_valid_tag; when it does not match,
    None is returned. Otherwise the first three characters are the tag,
    followed by one character each for first indicator, second indicator
    and subfield code. An underscore used as indicator is turned into an
    empty string, and parts missing from a short specification are empty
    strings as well.

    For example: "100__a" will return ('100', '', '', 'a')

    @param field_tag: field tag to extract MARC parts from
    @type field_tag: string

    @return: tuple of MARC tag parts, tag, ind1, ind2, code
    @rtype: tuple
    """
    if re_valid_tag.match(field_tag) is None:
        return None

    def _blank_underscore(indicator):
        # "_" is the placeholder for an empty indicator
        if indicator == "_":
            return ""
        return indicator

    # Slices (rather than indexing) give '' for positions beyond the end
    return (field_tag[:3],
            _blank_underscore(field_tag[3:4]),
            _blank_underscore(field_tag[4:5]),
            field_tag[5:6])
def get_paired_comparisons(first_list, second_list, ignore_order=True):
    """
    Build the list of comparisons to perform between two lists of values.

    When order is ignored, every item of first_list is grouped with every
    item of second_list (see _get_grouped_pairs()). When order matters, the
    lists must be of same length and each comparison consists of the single
    pair of items found at the same position; lists of different length
    give an empty result.

    @param first_list: sequence to pair with second_list items
    @type first_list: list

    @param second_list: sequence to be paired against first_list
    @type second_list: list

    @param ignore_order: True to pair every item with every other item,
        False for position-by-position pairs only
    @type ignore_order: bool

    @return: list of comparisons, one per first_list item, each one a tuple
        of (first item, second item) pairs
    @rtype: list
    """
    if ignore_order:
        # Every item against every item, grouped by first_list item
        return _get_grouped_pairs(first_list, second_list)

    if len(first_list) != len(second_list):
        # Position-by-position pairing needs the same number of items
        return []

    # One single-pair comparison per position
    return [((left, right),) for left, right in zip(first_list, second_list)]
def compare_numbers(num1, num2):
    """
    This function will try to compare two numbers to each other,
    returning the normalized distance between them. The value
    returned will be between 0.0 - 1.0, with 1.0 being a full
    match, decreasing 0.1 per unit (e.g. year) in difference. Numbers
    that are 10 or more apart score 0.0.

    Inspired by similar function in MarcXimil
    (http://marcximil.sourceforge.net/).

    @param num1: the first number to compare
    @type num1: int

    @param num2: the second number to compare
    @type num2: int

    @return: the normalized equality score between 0.0 and 1.0
    @rtype: float
    """
    # Clamp at 0.0 so the score stays inside the documented range instead of
    # turning negative for differences above 10.
    return max(0.0, 1.0 - (abs(num1 - num2) * 0.1))
def get_separated_string_variants(s, sep=':'):
    """
    Return the parts obtained by cutting the given string in two at each
    occurrence of the separator, plus the whole string itself. Empty parts
    are left out.

    For example:

    "scalar tensor theory : validity of Cosmic no hair conjecture"

    produces:

    ['scalar tensor theory ',
     ' validity of Cosmic no hair conjecture',
     'scalar tensor theory : validity of Cosmic no hair conjecture']

    With several separators, every cut position is used in turn:

    "scalar tensor theory : validity of Cosmic no hair : conjecture"

    produces:

    ['scalar tensor theory ',
     ' validity of Cosmic no hair : conjecture',
     'scalar tensor theory : validity of Cosmic no hair ',
     ' conjecture',
     'scalar tensor theory : validity of Cosmic no hair : conjecture']

    @param s: string to generate variants from
    @type s: string

    @param sep: separator that splits the string in two. Defaults to colon (:).
    @type sep: string

    @return: list of strings
    @rtype: list
    """
    pieces = s.split(sep)
    variants = []
    # cut runs from 1 to len(pieces); the last cut leaves nothing on the
    # right-hand side, so it only contributes the complete string.
    for cut in range(1, len(pieces) + 1):
        before = sep.join(pieces[:cut])
        after = sep.join(pieces[cut:])
        for candidate in (before, after):
            if candidate != '':
                variants.append(candidate)
    return variants
def get_reversed_string_variants(s, sep=','):
    """
    Return the given string together with a variant in which the text
    before and after the separator has swapped places.

    The string is split with string_partition() from BibAuthorId
    (presumably at the first occurrence of the separator, like
    str.partition -- verify against bibauthorid_name_utils). The first item
    of the result is the three parts joined in their original order, the
    second one joins them as after + separator + before.

    For example, "lastname, firstname", "firstname, lastname"

    @param s: string to extract pair from
    @type s: string

    @param sep: separator that splits the string in two. Defaults to comma (,).
    @type sep: string

    @return: tuple of strings
    @rtype: tuple
    """
    # Extract the different parts of the name using partition function.
    before, found_sep, after = string_partition(s, sep)
    as_given = before + found_sep + after
    swapped = after + found_sep + before
    return (as_given, swapped)
def _get_grouped_pairs(first_list, second_list):
    """
    Pair every item of the first list with every item of the second list,
    grouping the pairs by first-list item.

    e.g. [1,2,3],[4,5] -> [((1, 4), (1, 5)),
                           ((2, 4), (2, 5)),
                           ((3, 4), (3, 5))]

    @param first_list: an iterable to pair with second_list items
    @type first_list: iterable

    @param second_list: an iterable to be paired against first_list
    @type second_list: iterable

    @return: list with one tuple per first_list item, each tuple holding the
        (first item, second item) pairs for that item
    @rtype: list
    """
    return [tuple([(first_item, second_item) for second_item in second_list])
            for first_item in first_list]
Event Timeline
Log In to Comment