bibclassify_text_normalizer.py

# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BibClassify text_normalizer.
This module provides methods to clean the text lines. Currently, the methods
are tuned to work with the output of pdftotext and documents in the HEP field.
Methods can be tuned to your needs through the configuration file.
This modules uses the refextract module of BibEdit in order to find the
references section and to replace unicode characters.
"""
import sys
import re
try:
    from bibclassify_utils import write_message
except ImportError, err:
    print >> sys.stderr, "Import error: %s" % err
    sys.exit(1)

try:
    from invenio.refextract import replace_undesirable_characters, \
        find_reference_section, find_end_of_reference_section
except ImportError:
    # Running in standalone mode.
    try:
        from refextract import replace_undesirable_characters, \
            find_reference_section, find_end_of_reference_section
    except ImportError, err:
        print >> sys.stderr, "Import error: %s" % err
        sys.exit(1)


def normalize_fulltext(fulltext):
    """Returns a 'cleaned' version of the output provided by pdftotext."""
    # Keywords are recognized through their surrounding spaces, so pad the
    # text to let the first and last words of the document match too.
    fulltext = " " + fulltext + " "
    # Replace some undesirable Unicode characters.
    fulltext = replace_undesirable_characters(fulltext)
    # Replace Greek characters by their names.
    fulltext = _replace_greek_characters(fulltext)
    washing_regex = [
        # Join "non" and "anti" prefixes to the following word (e.g.
        # "non-abelian" -> "nonabelian"). This allows a better detection
        # of keywords such as "nonabelian".
        (re.compile(r"(\snon)[- ](\w+)"), r"\1\2"),
        (re.compile(r"(\santi)[- ](\w+)"), r"\1\2"),
        # Remove all leading numbers (e.g. "2-pion" -> "pion").
        (re.compile(r"\s\d-"), " "),
        # Remove multiple spaces.
        (re.compile(r" +"), " "),
    ]
    # Remove spaces in particle names.
    # Particles with -/+/*
    washing_regex += [(re.compile(r"(%s) ([-+*])" % name), r"\1\2")
                      for name in ("c", "muon", "s", "B", "D", "K", "Lambda",
                                   "Mu", "Omega", "Pi", "Sigma", "Tau", "W",
                                   "Xi")]
    # Particles followed by numbers
    washing_regex += [(re.compile(r"(%s) ([0-9]\W)" % name), r"\1\2")
                      for name in ("a", "b", "c", "f", "h", "s", "B", "D",
                                   "H", "K", "L", "Phi", "Pi", "Psi", "Rho",
                                   "Stor", "UA", "Xi", "Z")]
    washing_regex += [(re.compile(r"(\W%s) ?\( ?([0-9]+) ?\)[A-Z]?" % name),
                       r"\1(\2)")
                      for name in ("CP", "E", "G", "O", "S", "SL", "SO",
                                   "Spin", "SU", "U", "W", "Z")]
    # Particles with '
    washing_regex += [(re.compile(r"(\W%s) ('\W)" % name), r"\1\2")
                      for name in ("Eta", "W", "Z")]
    # Particles with (N)
    washing_regex += [(re.compile(r"(\W%s) ?\( ?N ?\)[A-Z]?" % name),
                       r"\1(N)")
                      for name in ("CP", "GL", "O", "SL", "SO", "Sp", "Spin",
                                   "SU", "U", "W", "Z")]
    # All names followed by ([0-9]{3,4})
    washing_regex.append((re.compile(r"([A-Za-z]) (\([0-9]{3,4}\)\+?)\s"),
                          r"\1\2 "))
    # Some weird names followed by ([0-9]{3,4})
    washing_regex += [(re.compile(r"(\W%s) (\([0-9]{3,4}\))" % name),
                       r"\1\2 ")
                      for name in ("a0", "Ds1", "Ds2", "K\*")]
    washing_regex += [
        # Remove all lone operators (usually these are errors
        # introduced by pdftotext).
        (re.compile(r" [+*] "), r" "),
        # Remove multiple spaces.
        (re.compile(r" +"), " "),
        # Remove multiple line breaks.
        (re.compile(r"\n+"), r"\n"),
    ]
    # Apply the regular expressions to the fulltext.
    for regex, replacement in washing_regex:
        fulltext = regex.sub(replacement, fulltext)
    return fulltext


def cut_references(text_lines):
    """Returns the text lines with the reference section cut out."""
    ref_sect_start = find_reference_section(text_lines)
    if ref_sect_start is not None:
        start = ref_sect_start["start_line"]
        end = find_end_of_reference_section(text_lines, start,
            ref_sect_start["marker"], ref_sect_start["marker_pattern"])
        del text_lines[start:end + 1]
    else:
        write_message("WARNING: No references could be found.",
            stream=sys.stderr, verbose=2)
    return text_lines
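

# Map from Unicode characters (Greek letters and a few look-alike symbols,
# dashes, primes and operators) to the ASCII replacements used in the
# normalized text.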
_GREEK_REPLACEMENTS = {
    u'\u00AF' : u' ',
    u'\u00B5' : u' Mu ',
    u'\u00D7' : u' x ',
    u'\u0391' : u' Alpha ',
    u'\u0392' : u' Beta ',
    u'\u0393' : u' Gamma ',
    u'\u0394' : u' Delta ',
    u'\u0395' : u' Epsilon ',
    u'\u0396' : u' Zeta ',
    u'\u0397' : u' Eta ',
    u'\u0398' : u' Theta ',
    u'\u0399' : u' Iota ',
    u'\u039A' : u' Kappa ',
    u'\u039B' : u' Lambda ',
    u'\u039C' : u' Mu ',
    u'\u039D' : u' Nu ',
    u'\u039E' : u' Xi ',
    u'\u039F' : u' Omicron ',
    u'\u03A0' : u' Pi ',
    u'\u03A1' : u' Rho ',
    u'\u03A3' : u' Sigma ',
    u'\u03A4' : u' Tau ',
    u'\u03A5' : u' Upsilon ',
    u'\u03A6' : u' Phi ',
    u'\u03A7' : u' Chi ',
    u'\u03A8' : u' Psi ',
    u'\u03A9' : u' Omega ',
    u'\u03B1' : u' Alpha ',
    u'\u03B2' : u' Beta ',
    u'\u03B3' : u' Gamma ',
    u'\u03B4' : u' Delta ',
    u'\u03B5' : u' Epsilon ',
    u'\u03B6' : u' Zeta ',
    u'\u03B7' : u' Eta ',
    u'\u03B8' : u' Theta ',
    u'\u03B9' : u' Iota ',
    u'\u03BA' : u' Kappa ',
    u'\u03BB' : u' Lambda ',
    u'\u03BC' : u' Mu ',
    u'\u03BD' : u' Nu ',
    u'\u03BE' : u' Xi ',
    u'\u03BF' : u' Omicron ',
    u'\u03C0' : u' Pi ',
    u'\u03C1' : u' Rho ',
    u'\u03C2' : u' Sigma ',
    u'\u03C3' : u' Sigma ',
    u'\u03C4' : u' Tau ',
    u'\u03C5' : u' Upsilon ',
    u'\u03C6' : u' Phi ',
    u'\u03C7' : u' Chi ',
    u'\u03C8' : u' Psi ',
    u'\u03C9' : u' Omega ',
    u'\u03CA' : u' Iota ',
    u'\u03CB' : u' Upsilon ',
    u'\u03CC' : u' Omicron ',
    u'\u03CD' : u' Upsilon ',
    u'\u03CE' : u' Omega ',
    u'\u03CF' : u' Kai ',
    u'\u03D0' : u' Beta ',
    u'\u03D1' : u' Theta ',
    u'\u03D2' : u' Upsilon ',
    u'\u03D3' : u' Upsilon ',
    u'\u03D4' : u' Upsilon ',
    u'\u03D5' : u' Phi ',
    u'\u03D6' : u' Pi ',
    u'\u03D7' : u' Kai ',
    u'\u03D8' : u' Koppa ',
    u'\u03D9' : u' Koppa ',
    u'\u03DA' : u' Stigma ',
    u'\u03DB' : u' Stigma ',
    u'\u03DC' : u' Digamma ',
    u'\u03DD' : u' Digamma ',
    u'\u03DE' : u' Koppa ',
    u'\u03DF' : u' Koppa ',
    u'\u03E0' : u' Sampi ',
    u'\u03E1' : u' Sampi ',
    u'\u2010' : u'-',
    u'\u2011' : u'-',
    u'\u2012' : u'-',
    u'\u2013' : u'-',
    u'\u2014' : u'-',
    u'\u2015' : u'-',
    u'\u2019' : u"'",
    u'\u2032' : u"'",
    u'\u2126' : u' Omega ',
    u'\u2206' : u' Delta ',
    u'\u2212' : u'-',
    u'\u2215' : u"/",
    u'\u2216' : u"\\",
    u'\u2217' : u"*",
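    # U+221D is PROPORTIONAL TO; it is presumably mapped to Alpha because
    # pdftotext tends to emit this glyph for the letter alpha.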
    u'\u221D' : u' Alpha ',
}


def _replace_greek_characters(line):
    """Replace Greek characters in a string by their names."""
    for greek_char, replacement in _GREEK_REPLACEMENTS.iteritems():
        try:
            line = line.replace(greek_char, replacement)
        except UnicodeDecodeError:
            write_message("WARNING: Unicode decoding error.",
                stream=sys.stderr, verbose=2)
            return ""
    return line
