diff --git a/modules/bibclassify/lib/bibclassify_text_normalizer.py b/modules/bibclassify/lib/bibclassify_text_normalizer.py
index 9b7219086..75fc6ee1c 100644
--- a/modules/bibclassify/lib/bibclassify_text_normalizer.py
+++ b/modules/bibclassify/lib/bibclassify_text_normalizer.py
@@ -1,231 +1,237 @@
 # -*- coding: utf-8 -*-
 ##
 ## This file is part of CDS Invenio.
 ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
 ##
 ## CDS Invenio is free software; you can redistribute it and/or
 ## modify it under the terms of the GNU General Public License as
 ## published by the Free Software Foundation; either version 2 of the
 ## License, or (at your option) any later version.
 ##
 ## CDS Invenio is distributed in the hope that it will be useful, but
 ## WITHOUT ANY WARRANTY; without even the implied warranty of
 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 ## General Public License for more details.
 ##
 ## You should have received a copy of the GNU General Public License
 ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
 ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

 """
 Bibclassify text_normalizer
 """

 __revision__ = "$Id$"

 import sys
 import re

 try:
-    from refextract import replace_undesirable_characters
+    from invenio.refextract import replace_undesirable_characters
 except ImportError, e1:
     print >> sys.stderr, "Error: %s" % e1
     sys.exit(1)

 def normalize_fulltext(fulltext):
     """Returns a 'cleaned' version of the output provided by pdftotext."""
-
-    fulltext = replace_greek_characters(fulltext)
-
     # We recognize keywords by the spaces. We need these to match the
     # first and last words of the document.
     fulltext = " " + fulltext + " "

     # Replace some weird unicode characters.
     fulltext = replace_undesirable_characters(fulltext)

     # Replace the greek characters by their name.
     fulltext = replace_greek_characters(fulltext)

-    # Reformat the punctuation.
-    fulltext = re.sub(r" *([,.]) *", r"\1 ", fulltext)
-    # Replace non with non-. This allows a better detection of keywords
-    # such as nonabelian.
-    fulltext = re.sub(r"(\snon)[- ](\w+)", r"\1\2", fulltext)
-    fulltext = re.sub(r"(\santi)[- ](\w+)", r"\1\2", fulltext)
-    # Remove all leading numbers (e.g. 2-pion -> pion)
-    fulltext = re.sub(r"\s\d-", " ", fulltext)
-
-    # Remove multiple spaces
-    fulltext = re.sub(r" +", " ", fulltext)
-
-    ## Remove spaces in particle names,
+    washing_regex = [
+        # Reformat the punctuation.
+        (re.compile(r" *([,.]) *"), r"\1 "),
+        # Replace non and anti with non- and anti-. This allows a better
+        # detection of keywords such as nonabelian.
+        (re.compile(r"(\snon)[- ](\w+)"), r"\1\2"),
+        (re.compile(r"(\santi)[- ](\w+)"), r"\1\2"),
+        # Remove all leading numbers (e.g. 2-pion -> pion).
+        (re.compile(r"\s\d-"), " "),
+        # Remove multiple spaces.
+        (re.compile(r" +"), " "),
+        ]
+
+    # Remove spaces in particle names.
     # Particles with -/+/*
-    for name in ("c", "muon", "s", "B", "D", "K", "Lambda", "Mu", "Omega",
-                 "Pi", "Sigma", "Tau", "W", "Xi"):
-        fulltext = re.sub(r"(%s) ([-+*])" % name, r"\1\2", fulltext)
-    # Particles followed directly by a number.
- for name in ("a", "b", "c", "f", "h", "s", "B", "D", "H", "K", "L", - "Phi", "Pi", "Psi","Rho", "Stor", "UA", "Xi", "Z"): - fulltext = re.sub(r"(%s) ([0-9]\W)" % name, r"\1\2", fulltext) + washing_regex += [(re.compile(r"(%s) ([-+*])" % name), r"\1\2") + for name in ("c", "muon", "s", "B", "D", "K", "Lambda", + "Mu", "Omega", "Pi", "Sigma", "Tau", "W", "Xi")] + + # Particles followed by numbers + washing_regex += [(re.compile(r"(%s) ([0-9]\W)" % name), r"\1\2") + for name in ("a", "b", "c", "f", "h", "s", "B", "D", "H", + "K", "L", "Phi", "Pi", "Psi","Rho", "Stor", "UA", + "Xi", "Z")] + washing_regex += [(re.compile(r"(\W%s) ?\( ?([0-9]+) ?\)[A-Z]?" % name), + r"\1(\2)") + for name in ("CP", "E", "G", "O", "S", "SL", "SO", + "Spin", "SU", "U", "W", "Z")] + # Particles with ' - for name in ("Eta", "W", "Z"): - fulltext = re.sub(r"(\W%s) ('\W)" % name, r"\1\2", fulltext) + washing_regex += [(re.compile(r"(\W%s) ('\W)" % name), r"\1\2") + for name in ("Eta", "W", "Z")] + # Particles with (N) - for name in ("CP", "GL", "O", "SL", "SO", "Sp", "Spin", "SU", "U", "W", - "Z"): - fulltext = re.sub(r"(\W%s) ?\( ?N ?\)[A-Z]?" % name, r"\1(N)", fulltext) - # Particles with ([0-9]+) - for name in ("CP", "E", "G", "O", "S", "SL", "SO", "Spin", "SU", "U", "W", - "Z"): - fulltext = re.sub(r"(\W%s) ?\( ?([0-9]+) ?\)[A-Z]?" % name, r"\1(\2)", - fulltext) + washing_regex += [(re.compile(r"(\W%s) ?\( ?N ?\)[A-Z]?" % name), r"\1(N)") + for name in ("CP", "GL", "O", "SL", "SO", "Sp", "Spin", + "SU", "U", "W", "Z")] # All names followed by ([0-9]{3,4}) - fulltext = re.sub(r"([A-Za-z]) (\([0-9]{3,4}\)\+?)\s", r"\1\2 ", fulltext) + washing_regex.append((re.compile(r"([A-Za-z]) (\([0-9]{3,4}\)\+?)\s"), + r"\1\2 ")) # Some weird names followed by ([0-9]{3,4}) - for name in ("a0", "Ds1", "Ds2", "K*"): - fulltext = re.sub(r"%s (\([0-9]{3,4}\))" % re.escape(name), - r"%s\1 " % name, fulltext) - - # Remove all lonel operators (usually these are errors introduced by - # pdftotext.) - fulltext = re.sub(r" [+*] ", r" ", fulltext) - - # Remove multiple spaces. - fulltext = re.sub(r" +", " ", fulltext) - # Remove multiple line breaks. - fulltext = re.sub(r"\n+", r"\n", fulltext) + washing_regex += [(re.compile(r"\(%s\) (\([0-9]{3,4}\))" % name), + r"\1\2 ") + for name in ("a0", "Ds1", "Ds2", "K\*")] + + washing_regex += [ + # Remove all lonel operators (usually these are errors + # introduced by pdftotext.) + (re.compile(r" [+*] "), r" "), + # Remove multiple spaces. + (re.compile(r" +"), " "), + # Remove multiple line breaks. + (re.compile(r"\n+"), r"\n"), + ] + + # Apply the regular expressions to the fulltext. + for regex, replacement in washing_regex: + fulltext = regex.sub(replacement, fulltext) return fulltext def cut_references(text_lines): """Returns the text lines with the references cut.""" try: - from refextract import find_reference_section, \ + from invenio.refextract import find_reference_section, \ find_end_of_reference_section except ImportError: print >> sys.stderr, ("Impossible to import refextract. Working on " "full document.") return text_lines ref_sect_start = find_reference_section(text_lines) if ref_sect_start is not None: start = ref_sect_start["start_line"] end = find_end_of_reference_section(text_lines, start, ref_sect_start["marker"], ref_sect_start["marker_pattern"]) del text_lines[start:end + 1] else: print >> sys.stderr, "No references could be found." 
         return text_lines

     return text_lines

 _GREEK_REPLACEMENTS = {
     u'\u00AF' : u' ',
     u'\u00B5' : u' Mu ',
     u'\u00D7' : u' x ',
     u'\u0391' : u' Alpha ',
     u'\u0392' : u' Beta ',
     u'\u0393' : u' Gamma ',
     u'\u0394' : u' Delta ',
     u'\u0395' : u' Epsilon ',
     u'\u0396' : u' Zeta ',
     u'\u0397' : u' Eta ',
     u'\u0398' : u' Theta ',
     u'\u0399' : u' Iota ',
     u'\u039A' : u' Kappa ',
     u'\u039B' : u' Lambda ',
     u'\u039C' : u' Mu ',
     u'\u039D' : u' Nu ',
     u'\u039E' : u' Xi ',
     u'\u039F' : u' Omicron ',
     u'\u03A0' : u' Pi ',
     u'\u03A1' : u' Rho ',
     u'\u03A3' : u' Sigma ',
     u'\u03A4' : u' Tau ',
     u'\u03A5' : u' Upsilon ',
     u'\u03A6' : u' Phi ',
     u'\u03A7' : u' Chi ',
     u'\u03A8' : u' Psi ',
     u'\u03A9' : u' Omega ',
     u'\u03B1' : u' Alpha ',
     u'\u03B2' : u' Beta ',
     u'\u03B3' : u' Gamma ',
     u'\u03B4' : u' Delta ',
     u'\u03B5' : u' Epsilon ',
     u'\u03B6' : u' Zeta ',
     u'\u03B7' : u' Eta ',
     u'\u03B8' : u' Theta ',
     u'\u03B9' : u' Iota ',
     u'\u03BA' : u' Kappa ',
     u'\u03BB' : u' Lambda ',
     u'\u03BC' : u' Mu ',
     u'\u03BD' : u' Nu ',
     u'\u03BE' : u' Xi ',
     u'\u03BF' : u' Omicron ',
     u'\u03C0' : u' Pi ',
     u'\u03C1' : u' Rho ',
     u'\u03C2' : u' Sigma ',
     u'\u03C3' : u' Sigma ',
     u'\u03C4' : u' Tau ',
     u'\u03C5' : u' Upsilon ',
     u'\u03C6' : u' Phi ',
     u'\u03C7' : u' Chi ',
     u'\u03C8' : u' Psi ',
     u'\u03C9' : u' Omega ',
     u'\u03CA' : u' Iota ',
     u'\u03CB' : u' Upsilon ',
     u'\u03CC' : u' Omicron ',
     u'\u03CD' : u' Upsilon ',
     u'\u03CE' : u' Omega ',
     u'\u03CF' : u' Kai ',
     u'\u03D0' : u' Beta ',
     u'\u03D1' : u' Theta ',
     u'\u03D2' : u' Upsilon ',
     u'\u03D3' : u' Upsilon ',
     u'\u03D4' : u' Upsilon ',
     u'\u03D5' : u' Phi ',
     u'\u03D6' : u' Pi ',
     u'\u03D7' : u' Kai ',
     u'\u03D8' : u' Koppa ',
     u'\u03D9' : u' Koppa ',
     u'\u03DA' : u' Stigma ',
     u'\u03DB' : u' Stigma ',
     u'\u03DC' : u' Digamma ',
     u'\u03DD' : u' Digamma ',
     u'\u03DE' : u' Koppa ',
     u'\u03DF' : u' Koppa ',
     u'\u03E0' : u' Sampi ',
     u'\u03E1' : u' Sampi ',
     u'\u2010' : u'-',
     u'\u2011' : u'-',
     u'\u2012' : u'-',
     u'\u2013' : u'-',
     u'\u2014' : u'-',
     u'\u2015' : u'-',
     u'\u2019' : u"'",
     u'\u2032' : u"'",
     u'\u2126' : u' Omega ',
     u'\u2206' : u' Delta ',
     u'\u2212' : u'-',
     u'\u2215' : u"/",
     u'\u2216' : u"\\",
     u'\u2217' : u"*",
     u'\u221D' : u' Alpha ',
 }

 def replace_greek_characters(line):
     """Replace greek characters in a string."""
     for greek_char, replacement in _GREEK_REPLACEMENTS.iteritems():
         try:
             line = line.replace(greek_char, replacement)
         except UnicodeDecodeError, err:
             print err
             return ""
     return line
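
The washing pass is easiest to see end to end. A minimal sketch, assuming an
installed Invenio tree (so that invenio.refextract and this module are
importable as shown) and that replace_undesirable_characters() leaves plain
ASCII untouched; the sample string is made up:

    from invenio.bibclassify_text_normalizer import normalize_fulltext

    # Typical pdftotext artifacts: leading numbers, spaced-out operators
    # and group arguments, hyphenated "non-" prefixes.
    raw = "The 2-pion decays of the W - boson in non-abelian SU ( 2 ) theory"
    print normalize_fulltext(raw)
    # -> " The pion decays of the W- boson in nonabelian SU(2) theory "
    # The leading/trailing spaces come from the padding step that makes
    # the first and last words matchable as keywords.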
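
Note that replace_greek_characters() only substitutes characters; it does not
tidy spacing. The doubled spaces it leaves behind are collapsed later by the
" +" washing rule. A small illustration (made-up input):

    from invenio.bibclassify_text_normalizer import replace_greek_characters

    # U+03C0 is a lowercase pi, U+2212 a minus sign.
    print replace_greek_characters(u"the \u03c0\u2212\u03c0 interaction")
    # -> u"the  Pi - Pi  interaction"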
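
cut_references() delegates the section detection entirely to refextract, so
whether anything is cut depends on its heuristics. An illustrative call (the
lines are invented; on detection failure, or when refextract cannot be
imported, the input comes back unchanged):

    from invenio.bibclassify_text_normalizer import cut_references

    lines = ["We measure W- production at LEP.",
             "References",
             "[1] A. Author, J. of Examples 12 (1999) 345.",
             "[2] B. Author, More Examples 34 (2001) 678."]
    # If refextract recognizes the "[n]" marker pattern, the reference
    # lines are deleted in place and the remaining lines are returned.
    body = cut_references(lines)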
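
On the motivation for precompiling: Python's re module caches compiled
patterns internally, so the per-call saving is mostly the cache lookup plus
the cleaner single application loop, not compilation itself. A rough
micro-benchmark sketch (not part of the patch; numbers will vary):

    import re
    import time

    text = " a non-abelian, anti-symmetric theory " * 2000
    pattern = re.compile(r"(\snon)[- ](\w+)")

    # Passing a pattern string goes through re's cache lookup every call.
    start = time.time()
    for _ in xrange(100):
        re.sub(r"(\snon)[- ](\w+)", r"\1\2", text)
    print "pattern string: %.3fs" % (time.time() - start)

    # A precompiled pattern object skips that lookup.
    start = time.time()
    for _ in xrange(100):
        pattern.sub(r"\1\2", text)
    print "precompiled:    %.3fs" % (time.time() - start)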