diff --git a/modules/bibclassify/lib/bibclassify_text_normalizer.py b/modules/bibclassify/lib/bibclassify_text_normalizer.py
index 75fc6ee1c..4053e3a65 100644
--- a/modules/bibclassify/lib/bibclassify_text_normalizer.py
+++ b/modules/bibclassify/lib/bibclassify_text_normalizer.py
@@ -1,237 +1,247 @@
 # -*- coding: utf-8 -*-
 ##
 ## This file is part of CDS Invenio.
 ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
 ##
 ## CDS Invenio is free software; you can redistribute it and/or
 ## modify it under the terms of the GNU General Public License as
 ## published by the Free Software Foundation; either version 2 of the
 ## License, or (at your option) any later version.
 ##
 ## CDS Invenio is distributed in the hope that it will be useful, but
 ## WITHOUT ANY WARRANTY; without even the implied warranty of
 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ## General Public License for more details.
 ##
 ## You should have received a copy of the GNU General Public License
 ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
 ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 
 """
 Bibclassify text_normalizer
 """
 
 __revision__ = "$Id$"
 
 import sys
 import re
 
 try:
     from invenio.refextract import replace_undesirable_characters
 except ImportError, e1:
-    print >> sys.stderr, "Error: %s" % e1
-    sys.exit(1)
+    try:
+        from refextract import replace_undesirable_characters
+    except ImportError, err:
+        print >> sys.stderr, "Error: %s" % err
+        sys.exit(1)
 
 def normalize_fulltext(fulltext):
     """Returns a 'cleaned' version of the output provided by pdftotext."""
     # We recognize keywords by the spaces. We need these to match the
     # first and last words of the document.
     fulltext = " " + fulltext + " "
 
     # Replace some weird unicode characters.
     fulltext = replace_undesirable_characters(fulltext)
     # Replace the greek characters by their name.
     fulltext = replace_greek_characters(fulltext)
 
     washing_regex = [
         (re.compile(r" *([,.]) *"), r"\1 "),
         # Replace non and anti with non- and anti-. This allows a better
         # detection of keywords such as nonabelian.
         (re.compile(r"(\snon)[- ](\w+)"), r"\1\2"),
         (re.compile(r"(\santi)[- ](\w+)"), r"\1\2"),
         # Remove all leading numbers (e.g. 2-pion -> pion).
         (re.compile(r"\s\d-"), " "),
         # Remove multiple spaces.
         (re.compile(r" +"), " "),
         ]
 
     # Remove spaces in particle names.
     # Particles with -/+/*
     washing_regex += [(re.compile(r"(%s) ([-+*])" % name), r"\1\2")
                       for name in ("c", "muon", "s", "B", "D", "K", "Lambda",
                                    "Mu", "Omega", "Pi", "Sigma", "Tau", "W",
                                    "Xi")]
 
     # Particles followed by numbers
     washing_regex += [(re.compile(r"(%s) ([0-9]\W)" % name), r"\1\2")
                       for name in ("a", "b", "c", "f", "h", "s", "B", "D",
                                    "H", "K", "L", "Phi", "Pi", "Psi", "Rho",
                                    "Stor", "UA", "Xi", "Z")]
     washing_regex += [(re.compile(r"(\W%s) ?\( ?([0-9]+) ?\)[A-Z]?" % name),
                       r"\1(\2)")
                       for name in ("CP", "E", "G", "O", "S", "SL", "SO",
                                    "Spin", "SU", "U", "W", "Z")]
 
     # Particles with '
     washing_regex += [(re.compile(r"(\W%s) ('\W)" % name), r"\1\2")
                       for name in ("Eta", "W", "Z")]
 
     # Particles with (N)
     washing_regex += [(re.compile(r"(\W%s) ?\( ?N ?\)[A-Z]?" % name),
                       r"\1(N)")
                       for name in ("CP", "GL", "O", "SL", "SO", "Sp", "Spin",
                                    "SU", "U", "W", "Z")]
 
     # All names followed by ([0-9]{3,4})
     washing_regex.append((re.compile(r"([A-Za-z]) (\([0-9]{3,4}\)\+?)\s"),
                           r"\1\2 "))
 
     # Some weird names followed by ([0-9]{3,4})
     washing_regex += [(re.compile(r"\(%s\) (\([0-9]{3,4}\))" % name),
                       r"\1\2 ")
                       for name in ("a0", "Ds1", "Ds2", "K\*")]
 
     washing_regex += [
         # Remove all lone operators (usually these are errors
         # introduced by pdftotext.)
         (re.compile(r" [+*] "), r" "),
         # Remove multiple spaces.
         (re.compile(r" +"), " "),
         # Remove multiple line breaks.
         (re.compile(r"\n+"), r"\n"),
         ]
 
     # Apply the regular expressions to the fulltext.
     for regex, replacement in washing_regex:
         fulltext = regex.sub(replacement, fulltext)
 
     return fulltext
 
 def cut_references(text_lines):
     """Returns the text lines with the references cut."""
     try:
         from invenio.refextract import find_reference_section, \
                                        find_end_of_reference_section
     except ImportError:
-        print >> sys.stderr, ("Impossible to import refextract. Working on "
-                              "full document.")
-        return text_lines
+        # Needed for bibclassify to run in standalone mode.
+        try:
+            from refextract import find_reference_section, \
+                                   find_end_of_reference_section
+        except ImportError, err1:
+            print >> sys.stderr, "Error: %s" % err1
+            sys.exit(1)
+        print >> sys.stderr, ("Impossible to import refextract. Working on "
+                              "full document.")
+        return text_lines
 
     ref_sect_start = find_reference_section(text_lines)
     if ref_sect_start is not None:
         start = ref_sect_start["start_line"]
         end = find_end_of_reference_section(text_lines, start,
                                             ref_sect_start["marker"],
                                             ref_sect_start["marker_pattern"])
         del text_lines[start:end + 1]
     else:
         print >> sys.stderr, "No references could be found."
         return text_lines
 
     return text_lines
 
 _GREEK_REPLACEMENTS = {
     u'\u00AF' : u' ',
     u'\u00B5' : u' Mu ',
     u'\u00D7' : u' x ',
     u'\u0391' : u' Alpha ',
     u'\u0392' : u' Beta ',
     u'\u0393' : u' Gamma ',
     u'\u0394' : u' Delta ',
     u'\u0395' : u' Epsilon ',
     u'\u0396' : u' Zeta ',
     u'\u0397' : u' Eta ',
     u'\u0398' : u' Theta ',
     u'\u0399' : u' Iota ',
     u'\u039A' : u' Kappa ',
     u'\u039B' : u' Lambda ',
     u'\u039C' : u' Mu ',
     u'\u039D' : u' Nu ',
     u'\u039E' : u' Xi ',
     u'\u039F' : u' Omicron ',
     u'\u03A0' : u' Pi ',
     u'\u03A1' : u' Rho ',
     u'\u03A3' : u' Sigma ',
     u'\u03A4' : u' Tau ',
     u'\u03A5' : u' Upsilon ',
     u'\u03A6' : u' Phi ',
     u'\u03A7' : u' Chi ',
     u'\u03A8' : u' Psi ',
     u'\u03A9' : u' Omega ',
     u'\u03B1' : u' Alpha ',
     u'\u03B2' : u' Beta ',
     u'\u03B3' : u' Gamma ',
     u'\u03B4' : u' Delta ',
     u'\u03B5' : u' Epsilon ',
     u'\u03B6' : u' Zeta ',
     u'\u03B7' : u' Eta ',
     u'\u03B8' : u' Theta ',
     u'\u03B9' : u' Iota ',
     u'\u03BA' : u' Kappa ',
     u'\u03BB' : u' Lambda ',
     u'\u03BC' : u' Mu ',
     u'\u03BD' : u' Nu ',
     u'\u03BE' : u' Xi ',
     u'\u03BF' : u' Omicron ',
     u'\u03C0' : u' Pi ',
     u'\u03C1' : u' Rho ',
     u'\u03C2' : u' Sigma ',
     u'\u03C3' : u' Sigma ',
     u'\u03C4' : u' Tau ',
     u'\u03C5' : u' Upsilon ',
     u'\u03C6' : u' Phi ',
     u'\u03C7' : u' Chi ',
     u'\u03C8' : u' Psi ',
     u'\u03C9' : u' Omega ',
     u'\u03CA' : u' Iota ',
     u'\u03CB' : u' Upsilon ',
     u'\u03CC' : u' Omicron ',
     u'\u03CD' : u' Upsilon ',
     u'\u03CE' : u' Omega ',
     u'\u03CF' : u' Kai ',
     u'\u03D0' : u' Beta ',
     u'\u03D1' : u' Theta ',
     u'\u03D2' : u' Upsilon ',
     u'\u03D3' : u' Upsilon ',
     u'\u03D4' : u' Upsilon ',
     u'\u03D5' : u' Phi ',
     u'\u03D6' : u' Pi ',
     u'\u03D7' : u' Kai ',
     u'\u03D8' : u' Koppa ',
     u'\u03D9' : u' Koppa ',
     u'\u03DA' : u' Stigma ',
     u'\u03DB' : u' Stigma ',
     u'\u03DC' : u' Digamma ',
     u'\u03DD' : u' Digamma ',
     u'\u03DE' : u' Koppa ',
     u'\u03DF' : u' Koppa ',
     u'\u03E0' : u' Sampi ',
     u'\u03E1' : u' Sampi ',
     u'\u2010' : u'-',
     u'\u2011' : u'-',
     u'\u2012' : u'-',
     u'\u2013' : u'-',
     u'\u2014' : u'-',
     u'\u2015' : u'-',
     u'\u2019' : u"'",
     u'\u2032' : u"'",
     u'\u2126' : u' Omega ',
     u'\u2206' : u' Delta ',
     u'\u2212' : u'-',
     u'\u2215' : u"/",
     u'\u2216' : u"\\",
     u'\u2217' : u"*",
     u'\u221D' : u' Alpha ',
     }
 
 def replace_greek_characters(line):
     """Replace greek characters in a string."""
     for greek_char, replacement in _GREEK_REPLACEMENTS.iteritems():
         try:
             line = line.replace(greek_char, replacement)
-        except UnicodeDecodeError, err:
-            print err
+        except UnicodeDecodeError, err1:
+            print err1
             return ""
     return line
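
For reviewers, a minimal usage sketch of the patched module. It is illustrative only: the input file name fulltext.txt is hypothetical, the sketch assumes the module is importable as bibclassify_text_normalizer, and only normalize_fulltext and cut_references come from the file changed above.

    # -*- coding: utf-8 -*-
    # Usage sketch (hypothetical file name; assumes the module is importable
    # as bibclassify_text_normalizer).
    import codecs

    from bibclassify_text_normalizer import normalize_fulltext, cut_references

    # Read the pdftotext output as unicode so the Greek replacements match.
    fulltext = codecs.open("fulltext.txt", "r", "utf-8").read()

    # Cut the reference section first, then wash the remaining text.
    lines = cut_references(fulltext.split("\n"))
    washed = normalize_fulltext("\n".join(lines))

    print washed.encode("utf-8")

Cutting references before washing is a plausible pipeline order, not one mandated by the module itself; both functions also work independently.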