Page MenuHomec4science

refextract.py
No OneTemporary

File Metadata

Created
Mon, Nov 11, 01:46

refextract.py

# -*- coding: utf-8 -*-
##
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
try:
import sys, re, string, time
import os, getopt, cgi
from cStringIO import StringIO
from invenio.refextract_config import *
except ImportError, e:
raise ImportError(e)
class StringBuffer1:
    """String buffer used to concatenate strings.

    This variant accumulates the text in an in-memory file object. Every
    appended item is formatted with "%s" before being written.
    """

    def __init__(self):
        self._bufferFile = StringIO()

    def append(self, itm):
        """Format itm as a string and write it at the end of the buffer"""
        text = "%s" % (itm,)
        self._bufferFile.write(text)

    def get(self):
        """Return everything written to the buffer so far as a single string"""
        contents = self._bufferFile.getvalue()
        return contents
class StringBuffer2:
    """String buffer used to concatenate strings.

    This variant collects the pieces in a list and joins them on demand.
    """

    def __init__(self):
        self._buffer = list()

    def append(self, itm):
        """Store itm as the next piece of the buffer"""
        self._buffer += [itm]

    def get(self):
        """Return all the pieces held in the buffer joined into one string"""
        separator = ''
        return separator.join(self._buffer)
class SystemMessage:
    """Holder for the help and version texts of the program"""

    def __init__(self):
        # The version text is taken from the refextract configuration module.
        self._versionMessage = cfg_refextract_version
        self._helpMessage = """refextract recid:pdffile [recid:pdffile]"""

    def getHelpMessage(self):
        """Return the usage text"""
        message = self._helpMessage
        return message

    def getVersionMessage(self):
        """Return the version text"""
        message = self._versionMessage
        return message
class ReferenceSection:
    """Concrete class representing the Reference section of a document. Once References have been extracted, they are put
       into a ReferenceSection object, which contains a list of "ReferenceLine" objects.

       The object keeps an internal line pointer (0-based) used by the "current line" methods. Methods taking a line
       number argument (lNum) use 1-based numbering, except lineExists, which takes a 0-based index.
    """

    class ReferenceSectionIterator:
        """Iterator over the ReferenceLine objects held by a ReferenceSection"""

        def __init__(self, reflines):
            self._mylist = reflines
            self._listptr = 0

        def __iter__(self):
            """Return self, so that the iterator can be used wherever an iterable is expected"""
            return self

        def next(self):
            """Return the next ReferenceLine; raise StopIteration once the list is exhausted"""
            try:
                item = self._mylist[self._listptr]
            except IndexError:
                raise StopIteration
            self._listptr += 1
            return item

    def __init__(self, refLineStrings=None):
        """Initialise a ReferenceSection object with the lines composing the references of a document. If a string
           argument is supplied, it will be appended as the first reference line. If a list argument is supplied, each
           element of the list that contains a string will be appended to the list of reference lines in order.
           Arguments of neither String nor list type (including the default, None) will be ignored and an empty
           ReferenceSection object will be the result
        """
        self._referenceLines = []
        self._lnPtr = 0
        # None replaces the former mutable [] default; both mean "no lines".
        if refLineStrings is None:
            refLineStrings = []
        if isinstance(refLineStrings, list):
            for line in refLineStrings:
                # Non-string elements are rejected by addNewLine and skipped.
                self.addNewLine(line)
        elif not self.addNewLine(refLineStrings):
            self._referenceLines = []
        self.resetLinePointer()

    def __iter__(self):
        """Return a new iterator over the reference lines"""
        return ReferenceSection.ReferenceSectionIterator(self._referenceLines)

    def resetLinePointer(self):
        """Reset the position of the ReferenceLine pointer of a ReferenceSection object to point at the first line"""
        self._lnPtr = 0

    def gotoNextLine(self):
        """Move the ReferenceLine pointer to the next line. Returns True if the pointer moved, False if there is no next line"""
        target = self._lnPtr + 1
        if not self.lineExists(target):
            return False
        self._lnPtr = target
        return True

    def gotoLine(self, lNum):
        """Move the ReferenceLine pointer to the line number supplied (1..n). Returns True on success, False if the line
           does not exist (the pointer is then left unchanged)
        """
        target = lNum - 1
        if not self.lineExists(target):
            return False
        self._lnPtr = target
        return True

    def getCurrentLineAsString(self):
        """Return a String containing the text contents of the ReferenceLine object currently pointed to by the internal
           pointer of a ReferenceSection object. Returns empty string (and resets the pointer) if no ReferenceLine is
           currently pointed at
        """
        if self.lineExists(self._lnPtr):
            return self._referenceLines[self._lnPtr].getContent()
        self.resetLinePointer()
        return u''

    def getCurrentLine(self):
        """Return the ReferenceLine object that is currently pointed to by the internal pointer of a ReferenceSection
           object. If no object is pointed at, reset the pointer and return the 'None' object
        """
        if self.lineExists(self._lnPtr):
            return self._referenceLines[self._lnPtr]
        self.resetLinePointer()
        return None

    def getLineAsString(self, lNum):
        """Return a String containing the text contents of the ReferenceLine object at the line number supplied (1..n).
           Returns an empty String if line number does not exist
        """
        if self.lineExists(lNum - 1):
            return self._referenceLines[lNum - 1].getContent()
        return u""

    def getLine(self, lNum):
        """Return the ReferenceLine at the line number supplied (1..n). Returns 'None' object if line number does not exist"""
        if self.lineExists(lNum - 1):
            return self._referenceLines[lNum - 1]
        return None

    def displayAllLines(self):
        """Display all ReferenceLine objects stored within a ReferenceSection object consecutively as Strings on the
           standard output stream
        """
        for refline in self._referenceLines:
            refline.display()

    def displayCurrentLine(self):
        """Display the ReferenceLine that is currently pointed to by a ReferenceSection object. If no line is pointed
           at, the pointer is reset instead
        """
        if self.lineExists(self._lnPtr):
            self._referenceLines[self._lnPtr].display()
        else:
            self.resetLinePointer()

    def displayLine(self, lNum):
        """Display the reference line at the line number supplied (1..n). Will display nothing if the line number does not exist"""
        if self.lineExists(lNum - 1):
            self._referenceLines[lNum - 1].display()

    def lineExists(self, lNum):
        """Returns True if line lNum exists in a ReferenceSection, False if not. (Reminder: Lines in the range 0..N)"""
        return (lNum < len(self._referenceLines)) and (lNum >= 0)

    def addNewLine(self, lineTxt):
        """Takes one String argument (lineTxt) and attempts to create a new ReferenceLine with this text, adding it to
           the last place in the referenceLines list of a ReferenceSection object. Returns True if successful, False if not
        """
        if not isinstance(lineTxt, (str, unicode)):
            return False
        self._referenceLines.append(ReferenceLine(lineTxt))
        return True

    def setContentLine(self, newContent):
        """Set the contents of the current line to that supplied in the 'newContent' argument. Return True on success,
           False on failure
        """
        if self.lineExists(self._lnPtr):
            return self._referenceLines[self._lnPtr].setContent(newContent)
        return False

    def lAppendLineText(self, appendStr):
        """Append text to the beginning of the ReferenceLine object currently pointed at by a ReferenceSection object"""
        if self.lineExists(self._lnPtr):
            return self._referenceLines[self._lnPtr].lAppend(appendStr)
        return False

    def rAppendLineText(self, appendStr):
        """Append text to the end of the ReferenceLine object currently pointed at by a ReferenceSection object"""
        if self.lineExists(self._lnPtr):
            return self._referenceLines[self._lnPtr].rAppend(appendStr)
        return False

    def isEmpty(self):
        """Return True if the reference section contains no reference lines, False if it does contain lines"""
        return len(self._referenceLines) < 1
class ReferenceLine:
    """Concrete class holding the text of one reference line as extracted from a document"""

    def __init__(self, data=''):
        """Store data as the line's text. Anything that is not exactly a str or unicode object
           leaves the line with a blank unicode string instead
        """
        self._content = u''
        if type(data) in (str, unicode):
            self._content = data

    def getContent(self):
        """Return the text held by the line"""
        return self._content

    def display(self):
        """Print the line's text, encoded as UTF-8, on the standard output stream"""
        encoded = self._content.encode("utf-8")
        print(encoded)

    def setContent(self, newContent=u''):
        """Replace the line's text with newContent. Returns True if the text was replaced, False if
           newContent is not a string
        """
        if type(newContent) not in (str, unicode):
            return False
        self._content = newContent
        return True

    def rAppend(self, appendStr):
        """Add appendStr after the line's current text. Returns True if the text was added, False if
           appendStr is not a string
        """
        if type(appendStr) not in (str, unicode):
            return False
        self._content = self._content + appendStr
        return True

    def lAppend(self, appendStr):
        """Add appendStr in front of the line's current text. Returns True if the text was added, False if
           appendStr is not a string
        """
        if type(appendStr) not in (str, unicode):
            return False
        self._content = appendStr + self._content
        return True
class ReferenceSectionDisplayer:
    """Writes a reference section to an output stream as UTF-8 encoded text.

       A ReferenceSection is written as plain lines between a header and a footer; a
       ProcessedReferenceSection is written as a MARC XML record.
    """

    def display(self, refsect, recid=None, myostream=sys.stdout):
        """Write refsect to myostream and flush it. Objects that are neither a ReferenceSection nor a
           ProcessedReferenceSection produce no output
        """
        if isinstance(refsect, ReferenceSection):
            rendered = self._rawReferebcesToString(refsect, recid)
        elif isinstance(refsect, ProcessedReferenceSection):
            rendered = self._processedReferebcesToMARCXMLString(refsect, recid)
        else:
            return
        myostream.write("%s" % (rendered.encode("utf-8"),))
        myostream.flush()

    def _rawReferebcesToString(self, refsect, recid=None):
        """Return the raw reference lines of refsect, one per line, wrapped in START/END marker lines. The
           START marker carries the record id when recid is not None. An empty section gives an empty string
        """
        if refsect.isEmpty():
            return u""
        # Section Header
        pieces = [u"#################### START REFERENCE SECTION "]
        if recid is not None:
            pieces.append(u"SYSID: '%s' " % (recid,))
        pieces.append(u"####################\n")
        for refline in refsect:
            pieces.append(refline.getContent() + u"\n")
        # Section Footer
        pieces.append(u"#################### END REFERENCE SECTION ####################\n")
        return u"".join(pieces)

    def _processedReferebcesToMARCXMLString(self, refsect, recid=None):
        """Return the MARC XML of refsect wrapped in a <record> element. A controlfield 001 holding the escaped
           recid is included only when recid is a str or unicode object. If the section's own MARC XML is blank,
           an empty string is returned
        """
        body = refsect.getSelfMARCXML()
        if not len(body.strip()) > 0:
            return u""
        out = u""" <record>\n"""
        if recid is not None and type(recid) in (unicode, str):
            out += u""" <controlfield tag="001">""" + cgi.escape(recid) + u"""</controlfield>\n"""
        out += body
        out += u""" </record>\n"""
        return out
class RegexWordSpacer:
    r"""Concrete Class. Inserts an optional whitespace matcher (\s*?) after the non-whitespace characters of a word.
        Useful because sometimes the document conversion process breaks up words with spaces
    """

    def space(self, word):
        r"""Return word as an uncompiled regex pattern in which every non-whitespace character is followed
            by \s*?. Whitespace characters are copied unchanged. Returns None if word is not a string
        """
        if type(word) not in (str, unicode):
            return None
        whitespace = re.compile(unicode(r'\s'), re.UNICODE)
        optionalGap = unicode(r'\s*?')
        pieces = []
        for char in word:
            if whitespace.match(char) is None:
                pieces.append(char + optionalGap)
            else:
                pieces.append(char)
        return u''.join(pieces)
class DocumentSearchPatternListCompiler:
    """Abstract class. Subclasses build regex patterns and hand them back as a 'CompiledPatternList' object, which is
       used for searching lines of a document for a given pattern
    """

    def getCompiledPatternList(self, prefix = u'', suffix = u''):
        """Return a list of compiled regex patterns. This base implementation does nothing and returns None"""
        return None

    def createPatterns(self, prefix = u'', suffix = u''):
        """Create the regex patterns (without compiling them). This base implementation does nothing and returns None"""
        return None
class RefSecnTitleListCompiler(DocumentSearchPatternListCompiler):
    """Concrete class. Builds a 'CompiledPatternList' object containing regex patterns enabling the identification of
       possible reference section titles in a text line
    """

    def getCompiledPatternList(self, prefix = u'', suffix = u''):
        """Return the compiled regex patterns used to ID a reference section title. The prefix and suffix
           arguments are not used
        """
        return CompiledPatternList(self.createPatterns())

    def createPatterns(self, prefix = u'', suffix = u''):
        """Create one regex pattern (not compiled) per title returned by getTitles. Each pattern is made of an optional
           section marker, the title with optional spaces between its letters (named group 'title', which may also
           include a trailing word "section"), and a line ending. The prefix and suffix arguments are not used
        """
        sectMarker = unicode(r'^\s*?([\[\-\{\(])?\s*?((\w|\d){1,5}([\.\-\,](\w|\d){1,5})?\s*?[\.\-\}\)\]]\s*?)?(?P<title>')
        lineEnd = unicode(r'(\s+?s\s*?e\s*?c\s*?t\s*?i\s*?o\s*?n\s*?)?)') \
                  + unicode(r'($|\s*?[\[\{\(\<]\s*?[1a-z]\s*?[\}\)\>\]]|\:)')
        spacer = RegexWordSpacer()
        patternList = []
        for title in self.getTitles():
            # Only non-empty strings produce a pattern.
            if type(title) in (str, unicode) and len(title) > 0:
                patternList.append(sectMarker + spacer.space(title) + lineEnd)
        return patternList

    def getTitles(self):
        """Return the list of titles to be searched for"""
        return [
            u'references',
            u'r\u00C9f\u00E9rences',
            u'r\u00C9f\u00C9rences',
            u'reference',
            u'refs',
            u'r\u00E9f\u00E9rence',
            u'r\u00C9f\u00C9rence',
            u'r\xb4ef\xb4erences',
            u'r\u00E9fs',
            u'r\u00C9fs',
            u'bibliography',
            u'bibliographie',
            u'citations',
        ]
class PostRefSecnTitleListCompiler(DocumentSearchPatternListCompiler):
    """Concrete class. Builds a 'CompiledPatternList' object containing regex patterns enabling the identification of
       possible titles that usually follow the reference section in a doc
    """

    def getCompiledPatternList(self, prefix = '', suffix = ''):
        """Return the compiled regex patterns used to ID a post reference section title. The prefix and suffix
           arguments are not used
        """
        return CompiledPatternList(self.createPatterns())

    def createPatterns(self, prefix = '', suffix = ''):
        """Create the regex patterns (not compiled): first the section title patterns, then the figure number
           patterns, then the table number patterns. The prefix and suffix arguments are not used
        """
        thead = unicode(r'^\s*?([\{\(\<\[]?\s*?(\w|\d)\s*?[\)\}\>\.\-\]]?\s*?)?')
        ttail = unicode(r'(\s*?\:\s*?)?')
        numatn = unicode(r'(\d+|\w\b|i{1,3}v?|vi{0,3})[\.\,]?\b')
        lineStart = r'^\s*?'
        spacer = RegexWordSpacer()
        patterns = []
        # Section titles, each with the regex fragment that may follow the word:
        sectionTitles = (
            (u'appendix', u''),
            (u'appendices', u''),
            (u'acknowledgement', unicode(r's?')),
            (u'table', unicode(r'\w?s?\d?')),
            (u'figure', unicode(r's?')),
            (u'annex', unicode(r's?')),
            (u'discussion', unicode(r's?')),
            (u'remercie', unicode(r's?')),
        )
        for word, extra in sectionTitles:
            patterns.append(thead + spacer.space(word) + extra + ttail)
        # Figure nums, then Table nums (full word and abbreviation):
        for fullWord, shortWord in ((u'figure', u'fig'), (u'table', u'tab')):
            patterns.append(lineStart + spacer.space(fullWord) + numatn)
            patterns.append(lineStart + spacer.space(shortWord) + unicode(r'\.\s*?') + numatn)
            patterns.append(lineStart + spacer.space(shortWord) + unicode(r'\.?\s*?\d\w?\b'))
        return patterns
class PostRefSecnKWListCompiler(DocumentSearchPatternListCompiler):
    """Concrete class. Builds a 'CompiledPatternList' object containing regex patterns enabling the identification of
       Key Words/phrases that are often found in lines following the reference section of a document
    """

    def getCompiledPatternList(self, prefix = u'', suffix = u''):
        """Return the compiled regex patterns used to ID keywords usually found in lines after a reference
           section. The prefix and suffix arguments are not used
        """
        return CompiledPatternList(self.createPatterns())

    def createPatterns(self, prefix = u'', suffix = u''):
        """Create the regex patterns (not compiled). The prefix and suffix arguments are not used"""
        spacer = RegexWordSpacer()
        lineStart = unicode(r'^\s*?')
        preparedOrCreated = u''.join([
            unicode(r'('),
            spacer.space(u'prepared'),
            unicode(r'|'),
            spacer.space(u'created'),
            unicode(r').*?(AAS\s*?)?\sLATEX'),
        ])
        aasMacros = unicode(r'AAS\s+?LATEX\s+?') + spacer.space(u'macros') + u'v'
        producedUsing = lineStart + spacer.space(u'This paper has been produced using')
        springer = lineStart \
                   + spacer.space(u'This article was processed by the author using Springer-Verlag') \
                   + u' LATEX'
        return [preparedOrCreated, aasMacros, producedUsing, springer]
class FirstRefLineNumerationListCompiler(DocumentSearchPatternListCompiler):
    """Concrete class. Builds a 'CompiledPatternList' object containing regex patterns enabling the identification of
       the first reference line by its numeration marker
    """

    def getCompiledPatternList(self, prefix = u'', suffix = u''):
        """Return the compiled regex patterns used to ID the first reference line by its numeration marker.
           The prefix and suffix arguments are not used
        """
        return CompiledPatternList(self.createPatterns())

    def createPatterns(self, prefix = u'', suffix = u''):
        """Create the regex patterns (not compiled): a number in square brackets and a number in curly braces,
           each wrapped in a group named 'mark' with inner groups 'left', 'num' and 'right'.
           The prefix and suffix arguments are not used
        """
        markOpen = unicode(r'(?P<mark>')
        markClose = u')'
        markers = (
            unicode(r'(?P<left>\[)\s*?(?P<num>\d+)\s*?(?P<right>\])'),
            unicode(r'(?P<left>\{)\s*?(?P<num>\d+)\s*?(?P<right>\})'),
        )
        return [markOpen + marker + markClose for marker in markers]
class RefLineNumerationListCompiler(DocumentSearchPatternListCompiler):
    """Concrete class. Used to return a 'CompiledPatternList' object containing regex patterns enabling the ID of any
       reference line by its numeration marker
    """

    def getCompiledPatternList(self, prefix = u'', suffix = u''):
        """Return a list of compiled regex patterns used to ID the numeration marker for a reference line.

           prefix and suffix are handed on to createPatterns, so that a prefix supplied here really ends up in
           the compiled patterns (it used to be dropped, leaving the patterns without it).
        """
        patterns = self.createPatterns(prefix, suffix)
        return CompiledPatternList(patterns)

    def createPatterns(self, prefix = u'', suffix = u''):
        """Create the regex patterns (don't compile though).

           Every pattern is: optional whitespace, the prefix, then the numeration marker wrapped in a group
           named 'mark'. The prefix is inserted as-is, so it is interpreted as regex text; a prefix that is not a
           string is replaced by an empty one. The suffix argument is not used.
        """
        if type(prefix) is str or type(prefix) is unicode:
            title = prefix
        else:
            title = u''
        g_name = unicode(r'(?P<mark>')
        g_close = u')'
        space = unicode(r'\s*?')
        # NOTE(review): the bare-digits marker comes before the "digits + closing bracket" markers, so with a
        # first-match-wins search the latter can only be reached when the former fails - confirm this is intended.
        markers = (
            unicode(r'\[\s*?(?P<linenumber>\d+)\s*?\]'),
            unicode(r'\[\s*?[a-zA-Z]+\s?(\d{1,4}[A-Za-z]?)?\s*?\]'),
            unicode(r'\{\s*?\d+\s*?\}'),
            unicode(r'\<\s*?\d+\s*?\>'),
            unicode(r'\(\s*?\d+\s*?\)'),
            unicode(r'(?P<marknum>\d+)\s*?\.'),
            unicode(r'\d+\s*?'),
            unicode(r'\d+\s*?\]'),
            unicode(r'\d+\s*?\}'),
            unicode(r'\d+\s*?\)'),
            unicode(r'\d+\s*?\>'),
            unicode(r'\[\s*?\]'),
            unicode(r'\*'),
        )
        patterns = []
        for marker in markers:
            patterns.append(space + title + g_name + marker + g_close)
        return patterns
class CompiledPatternList:
    """Concrete Class. List of compiled regex patterns, ready to be used for searching through text lines"""

    class CompiledPatternListIterator:
        """Iterator over the compiled patterns held by a CompiledPatternList"""

        def __init__(self, ptnlines):
            self._mylist = ptnlines
            self._listptr = 0

        def __iter__(self):
            """Return self, so that the iterator can be used wherever an iterable is expected"""
            return self

        def next(self):
            """Return the next compiled pattern; raise StopIteration once the list is exhausted"""
            try:
                item = self._mylist[self._listptr]
            except IndexError:
                raise StopIteration
            self._listptr += 1
            return item

    def __init__(self, patternList):
        """Accept a list (or tuple) of regex strings and compile them, case-insensitively and with the UNICODE flag,
           adding them to the internal list of compiled regex patterns. Any other kind of argument is ignored and
           leaves the pattern list empty
        """
        self._patterns = []
        if isinstance(patternList, (list, tuple)):
            for ptn in patternList:
                self._patterns.append(re.compile(ptn, re.I|re.UNICODE))

    def __iter__(self):
        """Return a CompiledPatternListIterator object so that the patterns held by a CompiledPatternList can be
           iterated through
        """
        return CompiledPatternList.CompiledPatternListIterator(self._patterns)

    def getNumPatterns(self):
        """Return the length of the internal pattern list (patterns)"""
        return len(self._patterns)

    def getPattern(self, ptnIdx):
        """Return the regex pattern at [ptnIdx] in the internal pattern list (self._patterns). Returns 'None' if
           ptnIdx is not an int or is out of range
        """
        if type(ptnIdx) is int and -1 < ptnIdx < len(self._patterns):
            return self._patterns[ptnIdx]
        return None

    def display(self):
        """Display all patterns held in a CompiledPatternList object, one per line, encoded as UTF-8"""
        for ptn in self._patterns:
            print(ptn.pattern.encode("utf-8"))
class LineSearchAlgorithm:
    """Search algorithm that tries the patterns of a pattern list against a line, one after the other"""

    def doSearch(self, searcher, line, patternList):
        """Return the result of the first pattern in patternList for which searcher.goSearch(line, pattern) gives
           something other than None. Returns None if no pattern matches, if line is not a string, or if
           patternList has no 'getNumPatterns' attribute
        """
        # Only objects exposing getNumPatterns are accepted as pattern lists.
        try:
            patternList.getNumPatterns
        except AttributeError:
            return None
        if type(line) not in (str, unicode):
            return None
        for pattern in patternList:
            found = searcher.goSearch(line, pattern)
            if found is not None:
                return found
        return None
class SearchExecuter:
    """Abstract class. Subclasses run a regex operation on a line of text which is passed to them"""

    def goSearch(self, line, pattern):
        """Run the search and return a match object or None. This base implementation does nothing and returns None"""
        return None
class MatchSearchExecuter(SearchExecuter):
    """Concrete class. Applies the 'match' method of a compiled re pattern to a line"""

    def goSearch(self, line, pattern):
        """Return what pattern.match(line) returns: a 'Match' object, or None when the pattern does not match at
           the start of the line
        """
        found = pattern.match(line)
        return found
class SearchSearchExecuter(SearchExecuter):
    """Concrete class. Applies the 'search' method of a compiled re pattern to a line"""

    def goSearch(self, line, pattern):
        """Return what pattern.search(line) returns: a 'Match' object, or None when the pattern does not match
           anywhere in the line
        """
        found = pattern.search(line)
        return found
class LineSearcher:
    """Concrete Class. This is the interface through which the user can carry out a line search"""

    def findAtStartLine(self, line, patternList):
        """Try the patterns of patternList at the start of line and return the outcome of the search
           (see LineSearchAlgorithm.doSearch)
        """
        return LineSearchAlgorithm().doSearch(MatchSearchExecuter(), line, patternList)

    def findWithinLine(self, line, patternList):
        """Try the patterns of patternList anywhere within line and return the outcome of the search
           (see LineSearchAlgorithm.doSearch)
        """
        return LineSearchAlgorithm().doSearch(SearchSearchExecuter(), line, patternList)
class TextLineTransformer:
    """Abstract Class Interface. Subclasses accept a line, perform some transformation on it and return the
       transformed line
    """

    def processLine(self, line):
        """Carry out the transformation on line. This base implementation does nothing and returns None"""
        return None
class EscapeSequenceTransformer(TextLineTransformer):
    """Class to correct escape sequences which were not properly represented in the document conversion.

       It holds a mapping from character sequences to replacement text (self._patterns) and applies every entry
       of the mapping to a line, longest sequences first.
    """

    def __init__(self):
        """Build the replacement mapping and fix the order in which its entries are applied"""
        self._patterns = self._getPatterns()
        self._replacementOrder = self._orderKeys(self._patterns)

    def processLine(self, line):
        """Apply every replacement of the mapping to line and return the resulting line.

           Replacements are applied longest sequence first. This matters because some single characters that are
           simply removed (e.g. u'\\u0013') are also the first character of longer sequences that stand for an
           accented letter (e.g. u'\\x13a'); removing the single character first would destroy the longer sequence.

           If a replacement raises TypeError, the line is returned as it stands at that point. If a replacement
           raises UnicodeDecodeError, an error message is written to the standard error stream and the program is
           terminated with exit status 1.
        """
        try:
            for seq in self._replacementOrder:
                line = line.replace(seq, self._patterns[seq])
        except UnicodeDecodeError:
            sys.stderr.write("Error: unable to decode line while correcting escape sequences: %s\n" \
                             % (sys.exc_info()[1],))
            sys.exit(1)
        except TypeError:
            pass
        return line

    def _orderKeys(self, patterns):
        """Return the keys of patterns as a list sorted longest first; keys of equal length are sorted by value
           so that the order is the same on every run
        """
        return sorted(patterns.keys(), key=lambda seq: (-len(seq), seq))

    def _getPatterns(self):
        """Return a dictionary mapping each character sequence to be corrected to its replacement text"""
        plist = {}

        def _strip(sequences):
            """Register every item of sequences as something to remove from the document"""
            for seq in sequences:
                plist[seq] = u""

        def _compose(marks, letters, composed):
            """For every mark, register mark+letter to be replaced by the character found in composed at the
               position of that letter in letters
            """
            for mark in marks:
                for letter, result in zip(letters, composed):
                    plist[mark + letter] = result

        # Control characters not suited to XML:
        _strip([u'\u2028', u'\u2029', u'\u202A', u'\u202B', u'\u202C', u'\u202D', u'\u202E',
                u'\u206A', u'\u206B', u'\u206C', u'\u206D', u'\u206E', u'\u206F',
                u'\uFFF9', u'\uFFFA', u'\uFFFB', u'\uFFFC', u'\uFEFF'])
        # Language Tag Code Points:
        _strip([u"\U000E0000", u"\U000E0001", u"\U000E0002", u"\U000E0003", u"\U000E0004", u"\U000E0005", u"\U000E0006", u"\U000E0007",
                u"\U000E0008", u"\U000E0009", u"\U000E000A", u"\U000E000B", u"\U000E000C", u"\U000E000D", u"\U000E000E", u"\U000E000F",
                u"\U000E0010", u"\U000E0011", u"\U000E0012", u"\U000E0013", u"\U000E0014", u"\U000E0015", u"\U000E0016", u"\U000E0017",
                u"\U000E0018", u"\U000E0019", u"\U000E001A", u"\U000E001B", u"\U000E001C", u"\U000E001D", u"\U000E001E", u"\U000E001F",
                u"\U000E0020", u"\U000E0021", u"\U000E0022", u"\U000E0023", u"\U000E0024", u"\U000E0025", u"\U000E0026", u"\U000E0027",
                u"\U000E0028", u"\U000E0029", u"\U000E002A", u"\U000E002B", u"\U000E002C", u"\U000E002D", u"\U000E002E", u"\U000E002F",
                u"\U000E0030", u"\U000E0031", u"\U000E0032", u"\U000E0033", u"\U000E0034", u"\U000E0035", u"\U000E0036", u"\U000E0037",
                u"\U000E0038", u"\U000E0039", u"\U000E003A", u"\U000E003B", u"\U000E003C", u"\U000E003D", u"\U000E003E", u"\U000E003F",
                u"\U000E0040", u"\U000E0041", u"\U000E0042", u"\U000E0043", u"\U000E0044", u"\U000E0045", u"\U000E0046", u"\U000E0047",
                u"\U000E0048", u"\U000E0049", u"\U000E004A", u"\U000E004B", u"\U000E004C", u"\U000E004D", u"\U000E004E", u"\U000E004F",
                u"\U000E0050", u"\U000E0051", u"\U000E0052", u"\U000E0053", u"\U000E0054", u"\U000E0055", u"\U000E0056", u"\U000E0057",
                u"\U000E0058", u"\U000E0059", u"\U000E005A", u"\U000E005B", u"\U000E005C", u"\U000E005D", u"\U000E005E", u"\U000E005F",
                u"\U000E0060", u"\U000E0061", u"\U000E0062", u"\U000E0063", u"\U000E0064", u"\U000E0065", u"\U000E0066", u"\U000E0067",
                u"\U000E0068", u"\U000E0069", u"\U000E006A", u"\U000E006B", u"\U000E006C", u"\U000E006D", u"\U000E006E", u"\U000E006F",
                u"\U000E0070", u"\U000E0071", u"\U000E0072", u"\U000E0073", u"\U000E0074", u"\U000E0075", u"\U000E0076", u"\U000E0077",
                u"\U000E0078", u"\U000E0079", u"\U000E007A", u"\U000E007B", u"\U000E007C", u"\U000E007D", u"\U000E007E", u"\U000E007F"])
        # Musical Notation Scoping:
        _strip([u"\U0001D173", u"\U0001D174", u"\U0001D175", u"\U0001D176",
                u"\U0001D177", u"\U0001D178", u"\U0001D179", u"\U0001D17A"])
        # Control characters U+0001..U+0008 (START OF HEADING .. BACKSPACE):
        _strip([u'\u0001', u'\u0002', u'\u0003', u'\u0004', u'\u0005', u'\u0006', u'\u0007', u'\u0008'])
        # SHIFT-IN & SHIFT-OUT:
        _strip([u'\u000E', u'\u000F'])
        # Other controls, U+0010..U+001F. U+001C and U+001D are not removed: they are turned into
        # parentheses below.
        _strip([u'\u0010', u'\u0011', u'\u0012', u'\u0013', u'\u0014', u'\u0015', u'\u0016', u'\u0017',
                u'\u0018', u'\u0019', u'\u001A', u'\u001B', u'\u001E', u'\u001F'])
        # \r -> remove it
        _strip([u'\r'])
        # Strange parentheses - change for normal:
        plist[u'\x1c'] = u'('
        plist[u'\x1d'] = u')'
        # Some ff from tex:
        plist[u'\u0013\u0010'] = u'\u00ED'
        plist[u'\x0b'] = u'ff'
        # fi from tex:
        plist[u'\x0c'] = u'fi'
        # ligatures from TeX:
        plist[u'\ufb00'] = u'ff'
        plist[u'\ufb01'] = u'fi'
        plist[u'\ufb02'] = u'fl'
        plist[u'\ufb03'] = u'ffi'
        plist[u'\ufb04'] = u'ffl'
        # Superscripts from TeX
        plist[u'\u2212'] = u'-'
        plist[u'\u2013'] = u'-'
        # Word style speech marks:
        plist[u'\u201d'] = u'"'
        plist[u'\u201c'] = u'"'
        # pdftotext has problems with umlaut and prints it as diaeresis followed by a letter: correct it.
        # Each mark is registered both directly followed by the letter and with a space in between (fixes
        # broken line examples). u'\x7f' is a further character seen in place of the diaeresis.
        _compose((u'\u00A8 ', u'\xA8', u'\x7f', u'\x7f '),
                 u'aeiouyAEIOUY',
                 u'\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF\u00C4\u00CB\u00CF\u00D6\u00DC\u0178')
        # pdftotext: fix acute accent:
        _compose((u'\x13', u'\x13 ', u'\u00B4 ', u'\u00B4'),
                 u'aeiouyAEIOUY',
                 u'\u00E1\u00E9\u00ED\u00F3\u00FA\u00FD\u00C1\u00C9\u00CD\u00D3\u00DA\u00DD')
        # pdftotext: fix grave accent:
        _compose((u'\u0060 ', u'\u0060'),
                 u'aeiouAEIOU',
                 u'\u00E0\u00E8\u00EC\u00F2\u00F9\u00C0\u00C8\u00CC\u00D2\u00D9')
        # \02C7 : caron
        _compose((u'\u02C7',), u'CcSsZz', u'\u010C\u010D\u0160\u0161\u017D\u017E')
        # \02DA : aa (a with ring above)
        _compose((u'\u02DA',), u'aA', u'\u00E5\u00C5')
        # \0327 : cedilla
        _compose((u'\u0327',), u'cC', u'\u00E7\u00C7')
        return plist
class URLRepairer(TextLineTransformer):
    """Class to attempt to re-assemble URLs which have been broken during the document's conversion to text.
       Broken URLs are recognised with a list of regex patterns that tolerate whitespace inside a URL; every
       fragment matched by one of those patterns has all of its whitespace removed
    """
    def __init__(self):
        """Initialise the (compiled) URI correction pattern list"""
        self._patterns = self._compilePatterns(self._getPatterns())

    def processLine(self, line):
        """Repair any broken URLs in line and return newly repaired line.
           A line that is neither a str nor a unicode object is returned unchanged
        """
        if not isinstance(line, (str, unicode)):
            return line
        # One space chopper serves every match found in this line (it used to be
        # re-created, and its regex recompiled, for each individual match)
        chopper = SpaceNullifier()
        def chop_spaces(m):
            """Return the text captured by group 1 of match m with all whitespace removed"""
            return chopper.processLine(m.group(1))
        for ptn in self._patterns:
            line = ptn.sub(chop_spaces, line)
        return line

    def _getPatterns(self):
        """Return a list of (uncompiled) regex pattern strings used to recognise broken URLs in a line.
           In every pattern, group 1 spans the whole fragment that is to have its spaces removed
        """
        # File-name extensions that a URL may end with; whitespace is allowed between the letters
        fileTypesList = [
            unicode(r'h\s*?t\s*?m'),              # htm
            unicode(r'h\s*?t\s*?m\s*?l'),         # html
            unicode(r't\s*?x\s*?t'),              # txt
            unicode(r'p\s*?h\s*?p'),              # php
            unicode(r'a\s*?s\s*?p\s*?'),          # asp
            unicode(r'j\s*?s\s*?p'),              # jsp
            unicode(r'p\s*?y'),                   # py (python)
            unicode(r'p\s*?l'),                   # pl (perl)
            unicode(r'x\s*?m\s*?l'),              # xml
            unicode(r'j\s*?p\s*?g'),              # jpg
            unicode(r'g\s*?i\s*?f'),              # gif
            unicode(r'm\s*?o\s*?v'),              # mov
            unicode(r's\s*?w\s*?f'),              # swf
            unicode(r'p\s*?d\s*?f'),              # pdf
            unicode(r'p\s*?s'),                   # ps
            unicode(r'd\s*?o\s*?c'),              # doc
            unicode(r't\s*?e\s*?x'),              # tex
            unicode(r's\s*?h\s*?t\s*?m\s*?l'),    # shtml
        ]
        # The patterns are applied in this order by processLine: first the scheme itself is
        # closed up, then progressively longer parts of the URL
        plist = [
            unicode(r'(h\s*t\s*t\s*p\s*\:\s*\/\s*\/)'),
            unicode(r'(f\s*t\s*p\s*\:\s*\/\s*\/\s*)'),
            unicode(r'((http|ftp):\/\/\s*[\w\d])'),
            unicode(r'((http|ftp):\/\/([\w\d\s\._\-])+?\s*\/)'),
            unicode(r'((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\s\.\-])+?\/)+)'),
            unicode(r'((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\s\.\-])+?\/)*([\w\d\_\s\-]+\.\s?[\w\d]+))'),
        ]
        # some possible endings for URLs:
        for ending in fileTypesList:
            plist.append(unicode(r'((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\.\-])+?\/)*([\w\d\_\-]+\.') + ending + u'))')
        # if url last thing in line, and only 10 letters max, concat them
        plist.append(unicode(r'((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\.\-])+?\/)*'\
                             r'\s*?([\w\d\_\.\-]\s?){1,10}\s*)$'))
        return plist

    def _compilePatterns(self, plist):
        """Compile the regex pattern strings in plist (case-insensitive, unicode-aware).
           Return a list of the compiled pattern objects, in the same order as plist
        """
        return [re.compile(ptn, re.I | re.UNICODE) for ptn in plist]
class SpaceNullifier(TextLineTransformer):
    """Line transformer that deletes every run of whitespace from a text string"""
    def __init__(self):
        """Prepare the whitespace-run pattern and the (empty) text that replaces each run"""
        self.rep = u''
        self.ptn = re.compile(unicode(r'\s+'), re.UNICODE)

    def processLine(self, line):
        """Return line with all of its whitespace removed.
           Anything that is not exactly a str or a unicode object is handed back untouched
        """
        if type(line) in (str, unicode):
            return self.ptn.sub(self.rep, line)
        return line
class MultispaceTruncator(TextLineTransformer):
    """Line transformer that collapses each run of two or more whitespace characters into one space"""
    def __init__(self):
        """Prepare the multiple-whitespace pattern and the single space that replaces each run"""
        self.rep = u' '
        self.ptn = re.compile(unicode(r'\s{2,}'), re.UNICODE)

    def processLine(self, line):
        """Return line with every run of 2+ whitespace characters replaced by a single space.
           Anything that is not exactly a str or a unicode object is handed back untouched
        """
        if type(line) in (str, unicode):
            return self.ptn.sub(self.rep, line)
        return line
class Document:
    """Abstract class representing a fulltext document in the system.
       The body of the document is kept in self._content, a list of text lines
    """
    def __init__(self, newDocBody = None, filepath = None):
        """Initialise state of a document object.
           If filepath is given, the document body is read from that file (and newDocBody is ignored);
           otherwise newDocBody (a list of lines, or a single str/unicode line) becomes the body.
           Any other kind of newDocBody leaves the document empty
        """
        self._content = []
        if filepath is not None:
            self._file_readlines(filepath)
        elif newDocBody is not None:
            # appendData itself ignores anything that is not a list or a string
            self.appendData(newDocBody)

    def _file_readlines(self, fname):
        """Append each line of the file fname, decoded from UTF-8, to the document body.
           Failure to open/read the file, or to decode a line, is reported on stderr and is not raised;
           lines read before the failure stay in the document
        """
        try:
            fh = open("%s" % (fname,), "r")
            try:
                for line in fh:
                    self._content.append(line.decode("utf-8"))
            finally:
                # Close the file even when a line cannot be decoded (it used to be left open)
                fh.close()
        except (IOError, ValueError):
            # ValueError also covers UnicodeDecodeError raised by decode()
            sys.stderr.write("""E: Failed to read in file "%s".\n""" % (fname,))

    def displayDocument(self):
        """Abstract: Display the Document"""
        pass

    def appendData(self, newData):
        """Add text to the document: every item of a list is appended as a line, a str or unicode
           object is appended as one line. Any other kind of data is silently ignored
        """
        if isinstance(newData, list):
            self._content.extend(newData)
        elif isinstance(newData, (str, unicode)):
            self._content.append(newData)

    def isEmpty(self):
        """Return True if the document has no lines (self._content is empty); False if it has"""
        return not self._content
class TextDocument(Document):
"""Concrete class representing a TextDocument - effectively a list of Strings of plaintext"""
def __init__(self, newDocBody = [], filepath = None):
"""Initialise a TextDocument object"""
Document.__init__(self, newDocBody, filepath)
def displayDocument(self):
for i in self._content: print i.encode("utf-8")
def getReferences(self, start, end):
    """Get the reference section lines, put them into a ReferenceSectionRebuilder object, ask it to rebuild the
       lines, and return the resulting ReferenceSection object.
       start is the ReferenceSectionStartPoint describing where the section begins; end is the index of the
       last line of the section (if end is not an int, an empty rebuilder is used).
       Note: when the section title shares a line with the first reference, that line of the document is
       modified in place - everything up to and including the title is cut off
    """
    if start.firstLineIsTitleAndMarker():
        # Title on same line as 1st ref- take title out!
        startIdx = start.getLineNum()
        # The title is literal text taken from the document, so it must be escaped before it
        # goes into a regex (it may contain characters such as '.', '(' or '[')
        sp = re.compile(unicode(r'^.*?') + re.escape(start.getTitleString()), re.UNICODE)
        parts = sp.split(self._content[startIdx], 1)
        if len(parts) > 1:
            self._content[startIdx] = parts[1]
        # else: title not found in the line - leave the line alone rather than fail
    elif start.titlePresent():
        # Title has a line of its own: references begin on the line after it
        startIdx = start.getLineNum() + 1
    else:
        startIdx = start.getLineNum()
    if type(end) is int:
        b = ReferenceSectionRebuilder(self._content[startIdx:end + 1])
    else:
        b = ReferenceSectionRebuilder()
    return b.getRebuiltLines(start)
def findEndReferenceSection(self, refStart):
"""Find the line number of the end of a TextDocument's reference section. Should be passed a ReferenceSectionStartPoint
object containing at least the start line of the reference section. The document is scanned forwards from that
start line until a line is met which either (a) matches a post-reference-section title or keyword pattern and is not
followed, within the next 5 lines, by a line matching a reference numeration marker, or (b) is the first of 4
consecutive lines that consist of numbers only. Returns the index of the line at which the scan stopped (the last
line of the document if neither test fired), or None if the start information cannot be used
"""
if refStart is None or refStart.getLineNum() is None:
# No reference section start info!
return None
sectEnded = 0
x = refStart.getLineNum()
if (type(x) is not int) or (x<0) or (x>len(self._content)) or (len(self._content)<1):
# Cant safely find end of refs with this info - quit!
return None
# Get line test patterns:
t_patterns = PostRefSecnTitleListCompiler().getCompiledPatternList()
kw_patterns = PostRefSecnKWListCompiler().getCompiledPatternList()
# If the start point carries a marker, only its own marker pattern is used to recognise reference
# lines; otherwise the general list of numeration patterns is used
if refStart.markerCharPresent():
mk_patterns = CompiledPatternList([refStart.getMarkerPattern()])
else:
mk_patterns = RefLineNumerationListCompiler().getCompiledPatternList()
# Matches a line made up only of (optionally signed, optionally decimal) numbers and whitespace
garbageDigit_pattern = re.compile(unicode(r'^\s*?([\+\-]?\d+?(\.\d+)?\s*?)+?\s*?$'),re.UNICODE)
searcher=LineSearcher()
while (x<len(self._content)) and (not sectEnded):
# Test 1: does this line look like the title/keyword of whatever follows the references?
end_match = searcher.findWithinLine(self._content[x], t_patterns)
if end_match is None:
end_match = searcher.findWithinLine(self._content[x], kw_patterns)
if end_match is not None:
# End reference section? Check within next 5 lines for other reference numeration markers
y = x+1
lineFnd = 0
while (y<x+6) and (y<len(self._content)) and (not lineFnd):
num_match=searcher.findWithinLine(self._content[y], mk_patterns)
# A marker match whose matched text is digits only is not counted as a reference line
if num_match is not None and not num_match.group(0).isdigit():
lineFnd = 1
y = y + 1
if not lineFnd:
# No ref line found-end section
sectEnded = 1
if not sectEnded:
# Does this & the next 3 lines simply contain numbers (4 lines in all)? If yes, it's probably the axis
# scale of a graph in a fig. End refs section
dm = garbageDigit_pattern.match(self._content[x])
if dm is not None:
y = x + 1
digitLines = 4
numDigitLns = 1
while(y<x+digitLines) and (y<len(self._content)):
dm = garbageDigit_pattern.match(self._content[y])
if dm is not None:
numDigitLns = numDigitLns + 1
y = y + 1
if numDigitLns == digitLines:
sectEnded = 1
x = x + 1
# x has been stepped once past the line at which the scan stopped, hence x - 1.
# NOTE(review): when a test above fired, x - 1 is the index of the very line that triggered it,
# so that line is reported as the (inclusive) end - confirm this is intended
return x - 1
def extractReferences(self,no_rebuild = False):
    """Locate the reference section of this TextDocument and return it as a ReferenceSection object.
       An empty ReferenceSection is returned when no section start, or no section end, can be found.
       (The no_rebuild argument is accepted but not consulted here)
    """
    # Page breaks, headers and footers would get in the way: strip them first
    self._removePageBoundaryInformation()
    # Look for a titled section first, then fall back on a section recognised by its line markers
    sectStart = self.findReferenceSection()
    if sectStart is None:
        sectStart = self.findReferenceSectionNoTitle()
    if sectStart is None:
        # Nothing that looks like a reference section
        return ReferenceSection()
    sectEnd = self.findEndReferenceSection(sectStart)
    if sectEnd is None:
        # A section without a known end is not safe to extract
        return ReferenceSection()
    return self.getReferences(sectStart, sectEnd)
def findReferenceSectionNoTitle(self):
"""Find the line number of the start of a TextDocument object's reference section by searching for the first reference
line. Can only find reference sections with distinct line markers such as [1]. The document is scanned backwards from
its last line; a line starting with a marker numbered 1 is accepted as the section start if, within the lines that
follow it (fewer than 10), a line starts with the same style of marker numbered 2 or the last line of the document is
reached. Returns a ReferenceSectionStartPoint object containing ref start line number, marker char & marker pattern,
or the None type if nothing found
"""
refStartLine = refLineMarker = refStart = None
if len(self._content) > 0:
mk_patterns = FirstRefLineNumerationListCompiler().getCompiledPatternList()
searcher = LineSearcher()
# NOTE(review): p_blank is compiled here but is not used anywhere in this method
p_blank = re.compile(unicode(r'^\s*$'))
# Work backwards from the last line of the document
x = len(self._content)-1
foundSect = 0
while x >= 0 and not foundSect:
mk_match = searcher.findAtStartLine(self._content[x], mk_patterns)
if mk_match is not None and string.atoi(mk_match.group('num')) == 1:
# Get mark recognition pattern:
mk_ptn = mk_match.re.pattern
# Look for [2] in next 10 lines:
nxtTestLines = 10
y = x + 1
tmpFnd = 0
while y < len(self._content) and y < x+nxtTestLines and not tmpFnd:
mk_match2=searcher.findAtStartLine(self._content[y], mk_patterns)
# Marker no. 2 must be wrapped in the same 'left' and 'right' text as marker no. 1
if (mk_match2 is not None) and (string.atoi(mk_match2.group('num')) == 2) and (mk_match.group('left') == mk_match2.group('left')) and (mk_match.group('right') == mk_match2.group('right')):
# Found next line:
tmpFnd = 1
elif y == len(self._content) - 1:
# Last line of the document reached: accept marker no. 1 without having seen no. 2
tmpFnd = 1
y = y + 1
if tmpFnd:
foundSect = 1
refStartLine = x
refLineMarker = mk_match.group('mark')
refLineMarkerPattern = mk_ptn
x = x - 1
if refStartLine is not None:
# Make ReferenceSectionStartPoint object with ref section start location details
refStart = ReferenceSectionStartPoint()
refStart.setLineNum(refStartLine)
refStart.setMarkerChar(refLineMarker)
refStart.setMarkerPattern(refLineMarkerPattern)
return refStart
def findReferenceSection(self):
"""Find the line number of the start of a TextDocument object's reference section. The document is scanned backwards
from its last line for a line matching a reference-section title pattern; each candidate title is then checked for a
reference numeration marker, either on the title line itself or on the first non-blank line after it. The scan stops
at a title that is followed by marker number 1; other candidates are remembered as partial finds. Returns a
'ReferenceSectionStartPoint' object containing details of the reference section start line number, the reference
section title & the marker char used for each reference line or returns None if not found
"""
refStartLine = refTitle = refLineMarker = refLineMarkerPattern = None
refStart = titleMarkerSameLine = foundPart = None
if len(self._content) > 0:
t_patterns = RefSecnTitleListCompiler().getCompiledPatternList()
mk_patterns = RefLineNumerationListCompiler().getCompiledPatternList()
searcher = LineSearcher()
p_blank = re.compile(unicode(r'^\s*$'))
# Try to find refs section title:
# (working backwards from the last line of the document)
x = len(self._content)-1
foundTitle = 0
while x >= 0 and not foundTitle:
title_match = searcher.findWithinLine(self._content[x], t_patterns)
if title_match is not None:
temp_refStartLine = x
tempTitle = title_match.group('title')
# Case 1: is there a numeration marker on the same line as the title?
mk_wtitle_ptrns = RefLineNumerationListCompiler().getCompiledPatternList(tempTitle)
mk_wtitle_match = searcher.findWithinLine(self._content[x], mk_wtitle_ptrns)
if mk_wtitle_match is not None:
mk = mk_wtitle_match.group('mark')
mk_ptn = mk_wtitle_match.re.pattern
p_num = re.compile(unicode(r'(\d+)'))
m_num = p_num.search(mk)
if m_num is not None and string.atoi(m_num.group(0)) == 1:
# Mark found.
foundTitle = 1
refTitle = tempTitle
refLineMarker = mk
refLineMarkerPattern = mk_ptn
refStartLine=temp_refStartLine
titleMarkerSameLine = 1
else:
# Marker is not number 1: remember this title as a partial find and keep scanning
foundPart = 1
refStartLine = temp_refStartLine
refLineMarker = mk
refLineMarkerPattern = mk_ptn
refTitle = tempTitle
titleMarkerSameLine = 1
else:
# Case 2: title has a line of its own - inspect the first non-blank line after it
try:
y = x + 1
# Move past blank lines
# (reading self._content[y] past the last line raises IndexError, handled below)
m_blank = p_blank.match(self._content[y])
while m_blank is not None and y < len(self._content):
y = y+1
m_blank = p_blank.match(self._content[y])
# Is this line numerated like a reference line?
mark_match = searcher.findAtStartLine(self._content[y], mk_patterns)
if mark_match is not None:
# Ref line found. What is it?
titleMarkerSameLine=None
mark = mark_match.group('mark')
mk_ptn = mark_match.re.pattern
p_num = re.compile(unicode(r'(\d+)'))
m_num = p_num.search(mark)
if m_num is not None and string.atoi(m_num.group(0)) == 1:
# 1st ref truly found
refStartLine = temp_refStartLine
refLineMarker = mark
refLineMarkerPattern = mk_ptn
refTitle = tempTitle
foundTitle = 1
# NOTE(review): m_num.groups(0) returns a tuple, so the '!= 1' comparison below is always true;
# this branch is therefore taken whenever the marker contains a number other than 1
elif m_num is not None and m_num.groups(0) != 1:
foundPart = 1
refStartLine = temp_refStartLine
refLineMarker = mark
refLineMarkerPattern = mk_ptn
refTitle = tempTitle
else:
# Marker contains no number at all
if foundPart:
foundTitle = 1
else:
foundPart = 1
refStartLine = temp_refStartLine
refTitle=tempTitle
refLineMarker = mark
refLineMarkerPattern = mk_ptn
else:
# No numeration
if foundPart:
foundTitle = 1
else:
foundPart = 1
refStartLine = temp_refStartLine
refTitle=tempTitle
except IndexError:
# References section title was on last line for some reason. Ignore
pass
x = x - 1
if refStartLine is not None:
# Make ReferenceSectionStartPoint object with ref
# section start location details
refStart = ReferenceSectionStartPoint()
refStart.setLineNum(refStartLine)
refStart.setTitleString(refTitle)
refStart.setMarkerChar(refLineMarker)
refStart.setMarkerPattern(refLineMarkerPattern)
if titleMarkerSameLine is not None:
refStart.setTitleMarkerSameLine()
return refStart
def _removePageBoundaryInformation(self):
    """Strip page breaks, page headers and page footers out of the document body.
       Returns 0 straight away (touching nothing) when the document holds no text at all
    """
    if not self.documentContainsText():
        # Only whitespace in here: nothing worth cleaning
        return 0
    # Where are the page breaks?
    breakPositions = self.getDocPageBreakPositions()
    # How many lines does a page header / a page footer take up (if that can be worked out)?
    headerSize = self.getHeadLines(breakPositions)
    footerSize = self.getFootLines(breakPositions)
    # Cut breaks, headers and footers out of the body
    self.chopHeadFootBreaks(breakPositions, headerSize, footerSize)
def getheadFootWordPattern(self):
    """Return the compiled regex that picks a 'word' (a run of letters, digits and hyphens)
       out of a header/footer line
    """
    wordRegex = unicode(r'([A-Za-z0-9-]+)')
    return re.compile(wordRegex, re.UNICODE)
def getHeadLines(self, breakIndices = []):
"""Using list of indices of pagebreaks in document, attempt to determine how many lines page headers consist of.
The n-th line after one page break is compared with the n-th line after other page breaks: while those lines are all
blank, or all hold the same number of roughly similar 'words' (see checkBoundaryLinesSimilar), the line is counted as
a header line and the next line is tested. Nothing is tested (0 is returned) unless len(breakIndices) - 1 is greater
than 2. Returns the number of header lines counted
"""
remainingBreaks = (len(breakIndices) - 1)
numHeadLns = emptyLine = 0
p_wordSearch = self.getheadFootWordPattern()
if remainingBreaks > 2:
if remainingBreaks > 3:
# Only check odd page headers
# (i.e. step through the page breaks two at a time)
nxtHead = 2
else:
# Check headers on each page
nxtHead = 1
keepChecking = True
while keepChecking:
# Each pass tests one more line below the page breaks; the reference line is the one below break no. 1
curBreak = 1
#m_blankLineTest = p_wordSearch.search(self._content[(breakIndices[curBreak]+numHeadLns+1)])
m_blankLineTest = re.compile(u'\S',re.UNICODE).search(self._content[(breakIndices[curBreak]+numHeadLns+1)])
if m_blankLineTest == None:
# Empty line in header:
emptyLine = 1
if (breakIndices[curBreak]+numHeadLns+1) == (breakIndices[(curBreak + 1)]):
# Have reached next pagebreak: document has no body - only head/footers!
keepChecking = False
grps_headLineWords = p_wordSearch.findall(self._content[(breakIndices[curBreak]+numHeadLns+1)])
curBreak = curBreak + nxtHead
# Compare the equivalent line below each of the other page breaks with the reference line
while (curBreak < remainingBreaks) and keepChecking:
grps_thisLineWords = p_wordSearch.findall(self._content[(breakIndices[curBreak]+numHeadLns+1)])
if emptyLine:
if len(grps_thisLineWords) != 0:
# This line should be empty, but isnt
keepChecking = False
else:
if (len(grps_thisLineWords) == 0) or (len(grps_headLineWords) != len(grps_thisLineWords)):
# Not same num 'words' as equivilent line in 1st header:
keepChecking = False
else:
keepChecking = self.checkBoundaryLinesSimilar(grps_headLineWords, grps_thisLineWords)
# Update curBreak for nxt line to check
curBreak = curBreak + nxtHead
if keepChecking:
# Line is a header line: check next
numHeadLns = numHeadLns+1
emptyLine = 0
return numHeadLns
def getFootLines(self, breakIndices = []):
"""Using list of indices of pagebreaks in document, attempt to determine how many lines page footers consist of.
The n-th line before one page break is compared with the n-th line before every following page break: while those
lines are all blank, or all hold the same number of roughly similar 'words' (see checkBoundaryLinesSimilar), the line
is counted as a footer line and the line before it is tested. Nothing is tested (0 is returned) unless there are more
than 2 page breaks. Returns the number of footer lines counted
"""
numBreaks = (len(breakIndices))
numFootLns = 0
emptyLine = 0
keepChecking = 1
p_wordSearch = self.getheadFootWordPattern()
if numBreaks > 2:
while keepChecking:
# Each pass tests one more line above the page breaks; the reference line is the one above break no. 1
curBreak = 1
#m_blankLineTest = p_wordSearch.match(self._content[(breakIndices[curBreak]-numFootLns-1)])
m_blankLineTest = re.compile(u'\S',re.UNICODE).search(self._content[(breakIndices[curBreak] - numFootLns - 1)])
if m_blankLineTest == None:
# Reference line holds no non-whitespace character: the equivalent lines must be empty too
emptyLine = 1
grps_headLineWords = p_wordSearch.findall(self._content[(breakIndices[curBreak]-numFootLns-1)])
curBreak=curBreak + 1
# Compare the equivalent line above each of the following page breaks with the reference line
while (curBreak < numBreaks) and keepChecking:
grps_thisLineWords = p_wordSearch.findall(self._content[(breakIndices[curBreak] - numFootLns - 1)])
if emptyLine:
if len(grps_thisLineWords) != 0:
# This line should be empty, but isnt
keepChecking = 0
else:
if (len(grps_thisLineWords) == 0) or (len(grps_headLineWords) != len(grps_thisLineWords)):
# Not the same number of 'words' as the reference footer line
keepChecking = 0
else:
keepChecking = self.checkBoundaryLinesSimilar(grps_headLineWords, grps_thisLineWords)
curBreak = curBreak + 1
if keepChecking:
# Line is a footer line: test the line above it next
numFootLns = numFootLns+1
emptyLine = 0
return numFootLns
def chopHeadFootBreaks(self, breakIndices = [], headLn = 0, footLn = 0):
"""Remove document lines containing breaks, headers, footers.
breakIndices is the list of indices of the page-break lines, headLn the number of header lines that follow a break
and footLn the number of footer lines that precede a break. Nothing is removed unless there are at least two breaks
and the shortest distance between two neighbouring breaks is greater than headLn + footLn + 1. When lines are
removed, the breakIndices list passed in is reversed in place
"""
numBreaks = len(breakIndices)
pageLens = []
# Distance between each pair of neighbouring page breaks
for x in range(0,numBreaks):
if x < numBreaks - 1:
pageLens.append(breakIndices[x + 1] - breakIndices[x])
pageLens.sort()
if (len(pageLens) > 0) and (headLn+footLn+1 < pageLens[0]):
# Safe to chop hdrs & ftrs
# Work from the last break to the first so that deleting lines does not shift
# the indices of the breaks still to be processed
breakIndices.reverse()
first = 1
for i in range(0, len(breakIndices)):
# Unless this is the last page break, chop headers
if not first:
for j in range(1,headLn+1):
# Deletes the line directly after the break, headLn times over
self._content[breakIndices[i]+1:breakIndices[i]+2] = []
else:
first = 0
# Chop page break itself
self._content[breakIndices[i]:breakIndices[i]+1] = []
# Chop footers (unless this is the first page break)
if i != len(breakIndices) - 1:
for k in range(1,footLn + 1):
# With the break line gone, the footer occupies the footLn lines ending just before its old position
self._content[breakIndices[i] - footLn:breakIndices[i] - footLn + 1] = []
def checkBoundaryLinesSimilar(self, l_1, l_2):
    """Tell whether two lists of 'words' are roughly the same.
       Words at the same position count as alike when both are whole numbers, or when (ignoring case)
       they begin with the same character and end with the same character. Returns True when at least
       90% of the positions are alike; False otherwise, and also when either argument is not a list,
       when the lists differ in length or when they are empty
    """
    if (type(l_1) != list) or (type(l_2) != list) or (len(l_1) != len(l_2)):
        return False
    wordCount = len(l_1)
    if wordCount == 0:
        return False
    p_int = re.compile(unicode(r'^(\d+)$'))
    alike = 0
    for word1, word2 in zip(l_1, l_2):
        isNum1 = p_int.match(word1)
        isNum2 = p_int.match(word2)
        if isNum1 is not None and isNum2 is not None:
            # Two numbers (e.g. page numbers) are taken to be alike whatever their value
            alike = alike + 1
            continue
        lowered1 = word1.lower()
        lowered2 = word2.lower()
        if lowered1[0] == lowered2[0] and lowered1[len(lowered1) - 1] == lowered2[len(lowered2) - 1]:
            alike = alike + 1
    return not (float(alike) / float(wordCount) < 0.9)
def getDocPageBreakPositions(self):
"""Locate page breaks in the list of document lines and make a list of their indices to be returned"""
pageBreaks = []
p_break = re.compile(unicode(r'^\s*?\f\s*?$'),re.UNICODE)
for i in range(len(self._content)):
if p_break.match(self._content[i]) != None:
pageBreaks.append(i)
return pageBreaks
def documentContainsText(self):
"""Test whether document contains text, or is just full of worthless whitespace. Return 1 if has text, 0 if not"""
foundWord = False
p_word = re.compile(unicode(r'\S+'))
for i in self._content:
if p_word.match(i) != None:
foundWord = True
break
return foundWord
class Ps2asciiEncodedTextDocument(Document):
"""Text document that is encoded with PS coordinate information. This type of document is result of a ps2ascii conversion"""
class Ps2asciiOutputLine:
"""Represents a line from a ps2ascii conversion"""
def __init__(self, posx, posy, content, diffx):
"""Initialise a dataline's state"""
self._posnX = self._posnY = 0
self._dataContent = ''
self._diff_posnX = 0
self.setPosX(int(posx))
self.setPosY(int(posy))
self.setText(content)
self.setDiffPosX(int(diffx))
def setPosX(self, x):
"""Set posnX value for a Ps2asciiOutputLine object"""
self._posnX = x
def setPosY(self, y):
"""Set posnY value for a Ps2asciiOutputLine object"""
self._posnY = y
def setText(self, data):
"""Set dataContent value for Ps2asciiOutputLine object"""
self._dataContent = data
def setDiffPosX(self, dpx):
"""Set diff_posnX value for a Ps2asciiOutputLine object"""
self._diff_posnX = dpx
def getPosX(self):
"""Return the posnX value for a Ps2asciiOutputLine object"""
return self._posnX
def getPosY(self):
"""Return the posnY value for a Ps2asciiOutputLine object"""
return self._posnY
def getText(self):
"""Return a cleaned up version of the dataContent in this Ps2asciiOutputLine object"""
return self._dataContent
def getDiffPosX(self):
"""Return the diff_posnX value for a Ps2asciiOutputLine object"""
return self._diff_posnX
def isNewLine(self, previousLine):
"""Check the positional coordinates of this line with those of the supplied Ps2asciiOutputLine object to
determine whether this is a new line. Return 1 if yes, or 0 if no
"""
if (self.getPosX() <= previousLine.getPosX()) and (self.getPosY() != previousLine.getPosY()):
return 1
else:
return 0
def isSpaceSeparated(self, posnxEst):
"""Return 1 if the text in this Ps2asciiOutputLine object should be separated from that in a
previous Ps2asciiOutputLine object, as determined by an X position estimate (posnxEst). Return 0 if not
"""
if (self.getPosX() > (posnxEst + 7)):
return 1
else:
return 0
def __init__(self, newDocBody = []):
Document.__init__(self, newDocBody)
def convertToPlainText(self):
"""Tell a Ps2asciiEncodedTextDocument to convert itself to convert itself to pure plaintext. Returns TextDocument object"""
# Converted document:
plaintextContent = []
tempLine = ''
# Fictitious old line to compare with 1st line:
oldRawLine = self.Ps2asciiOutputLine(9999,9999,"",0)
posnxEst = 9999
for line in self._content:
curRawLine = self.getDataLine(line)
if curRawLine != None:
# Find out if this a new line or a continuation of the last line
if curRawLine.isNewLine(oldRawLine):
# Append previous full line:
plaintextContent.append(self.prepareLineForAppending(tempLine))
# Start a new line buffer:
tempLine = curRawLine.getText()
else:
# Not new line: concat with last line
if curRawLine.isSpaceSeparated(posnxEst):
tempLine = tempLine+' '+curRawLine.getText()
else:
tempLine = tempLine+curRawLine.getText()
posnxEst = (curRawLine.getPosX() + curRawLine.getDiffPosX())
oldRawLine = curRawLine
# Append very last line to list:
plaintextContent.append(self.prepareLineForAppending(tempLine))
# Remove first, empty cell from list:
plaintextContent[0:1] = []
# Make a TextDocument with the newly converted text content and return it:
return TextDocument(plaintextContent)
def getDataLine(self, rawLine):
"""Take a raw line from ps2ascii, and put its components into a Ps2asciiOutputLine object"""
idPattern = re.compile(r'^S\s(?P<posnX>\d+)\s(?P<posnY>\d+)\s\((?P<content>.*)\)\s(?P<diff_posnX>\d+)$')
match = idPattern.search(rawLine)
if match != None:
return self.Ps2asciiOutputLine(match.group('posnX'), match.group('posnY'), match.group('content'), match.group('diff_posnX'))
else:
return None
def prepareLineForAppending(self, line):
"""Prepare the contents of a plaintext line which has been rebuilt from Ps2asciiOutputLine(s) to be appended to the
list of plaintext lines which make up the plaintext document Test its contents: if all whitespace, but not formfeed,
return an empty line; if contains non-whitespace or a formfeed, return the line as is
"""
# Clean line to append of control codes:
line = self.cleanLine(line)
ep = re.compile('\S')
em = ep.match(line)
if em == None:
fp = re.compile('^ *\f *$')
fm = fp.match(line)
if fm == None:
line = ''
return line
def cleanLine(self, line):
"""Clean a line of text of the messy character codes that ps2ascii adds during conversion"""
# Correct escaped parentheses
p = re.compile(r'\\\(')
line = p.sub('(', line)
p = re.compile(r'\\\)')
line = p.sub(r')', line)
# Correct special symbols
p = re.compile(r'\\\\')
line = p.sub('', line)
p = re.compile('\n')
line = p.sub(r' ', line)
# Change '\013' to 'ff' (ps2ascii messes this up)
p = re.compile(r'\\013')
line = p.sub('ff', line)
# Change '\017' (bullet point) into '*'
p = re.compile(r'\\017')
line = p.sub('*', line)
# Change '\003' into '*'
p = re.compile(r'\\003')
line = p.sub('', line)
# Change '\\f' to 'fi' (ps2ascii messes this up)
p = re.compile(r'\\f')
line = p.sub('fi', line)
# Remove page numbers:
p = re.compile('\{\s\d+\s\{')
line = p.sub(r'', line)
# Correct Hyphens:
p = re.compile('\{')
line = p.sub('-', line)
return line
def displayDocument(self):
"""Let Ps2asciiEncodedTextDocument display itself on standard output stream"""
for i in self._content:
print i
class ReferenceSectionStartPoint:
    """Concrete holder for what is known about the first line of a document's reference section:
       its line number, the section title, the marker of the first reference line and the regex
       pattern that recognises such markers
    """
    def __init__(self):
        """Start out with nothing known about the reference section start"""
        self._lineNum = None
        self._title = None
        self._lineMarkerPresent = None
        self._haveMarkerRegex = None
        self._markerChar = None
        self._markerRegexPattern = None
        self._markerTitleSameLine = None

    def setLineNum(self, num):
        """Record the line number at which the references section starts"""
        self._lineNum = num

    def setTitleString(self, t):
        """Record the title string of the references section"""
        self._title = t

    def setMarkerChar(self, m):
        """Record the marker of the first reference line. Only a str or unicode object is kept;
           anything else (None included) clears the marker
        """
        if m is not None and type(m) in (str, unicode):
            self._markerChar, self._lineMarkerPresent = m, 1
        else:
            self._markerChar, self._lineMarkerPresent = None, 0

    def setMarkerPattern(self, p):
        """Record the regex pattern for the start of the first reference line. Only a str or unicode
           object is kept; anything else (None included) clears the pattern
        """
        if p is not None and type(p) in (str, unicode):
            self._markerRegexPattern, self._haveMarkerRegex = p, 1
        else:
            self._markerRegexPattern, self._haveMarkerRegex = None, 0

    def setTitleMarkerSameLine(self):
        """Flag that the section title and the first reference line share one line"""
        self._markerTitleSameLine = 1

    def getLineNum(self):
        """Return the line number of the references section start (None if never set)"""
        return self._lineNum

    def getTitleString(self):
        """Return the title string of the references section (None if there is none)"""
        return self._title

    def firstLineIsTitleAndMarker(self):
        """Return True if the first reference line holds both the section title and the first
           line numeration marker, False if not
        """
        return self._markerTitleSameLine is not None

    def titlePresent(self):
        """Return True if a title has been recorded, False if not"""
        return self._title is not None

    def markerCharPresent(self):
        """Return True if a marker char has been recorded, False if not"""
        return bool(self._lineMarkerPresent)

    def markerPatternPresent(self):
        """Return True if a marker regex pattern has been recorded, False if not"""
        return bool(self._haveMarkerRegex)

    def getMarkerChar(self):
        """Return the marker char of the reference section start (None if there is none)"""
        return self._markerChar

    def getMarkerPattern(self):
        """Return the marker regex pattern of the reference section start (None if there is none)"""
        return self._markerRegexPattern
class ReferenceSectionRebuilder:
    """Concrete class whose job is to rebuild broken reference lines. Contains a list of Strings. Each String in this list
       represents the contents of either a complete reference line or part of a reference line. When a document is converted from
       its original format to plaintext, lines are often broken because the converter cant distinguish between wrapped lines and
       new lines. Objects of this class can be used to try to rebuild broken reference lines and create a 'ReferenceSection' object
    """
    def __init__(self, lines = None):
        """Initialise a ReferenceSectionRebuilder object with a list of 'broken' reference lines.
           lines may be a list of lines (kept as given, not copied) or a single str/unicode line;
           anything else - including no argument at all - gives an empty list of lines
        """
        if isinstance(lines, list):
            self._dataLines = lines
        elif isinstance(lines, (str, unicode)):
            # A lone line becomes a one-item list (this used to append to a list that did not exist yet)
            self._dataLines = [lines]
        else:
            # A fresh list per object: a shared default list would leak lines between rebuilders
            self._dataLines = []
def getRebuiltLines(self, refStartInfo):
    """Run the rebuilding of the reference lines and return a ReferenceSection object made from the
       rebuilt lines. An empty ReferenceSection is returned when refStartInfo does not look like a
       ReferenceSectionStartPoint (it has no getLineNum attribute)
    """
    try:
        # Only the presence of the attribute matters here
        refStartInfo.getLineNum
    except AttributeError:
        return ReferenceSection()
    # Leading junk goes first, so that the numeration style is worked out on real reference lines
    self._removeLeadingGarbageLines()
    numerationStyle = self._getLineNumerationStyle(refStartInfo)
    rebuiltLines = self._rebuild(numerationStyle)
    return ReferenceSection(rebuiltLines)
def _testBlankLineRefSeparators(self):
"""Test to see if reference lines are separated by blank lines so that these can be used to rebuild reference lines"""
p_ws = re.compile(unicode(r'^\s*$'),re.UNICODE)
numblank = 0 # No blank lines fnd between non-blanks
numline = 0 # No ref lines separated by blanks
blankLnSep = 0 # Flag to indicate if blanks lines separate ref lines
multi_nonblanks_fd = 0 # Flag to indicate if multiple nonblank lines are found together (used because
# if line is dbl-spaced, it isnt a blank that separates refs & cant be relied upon)
x = 0
max = len(self._dataLines)
while x < max:
m_ws = p_ws.search(self._dataLines[x])
if m_ws is None:
# ! empty line
numline = numline+1
x = x + 1 # Move past line
while x < len(self._dataLines) and p_ws.search(self._dataLines[x]) is None:
multi_nonblanks_fd=1
x = x + 1
x = x - 1
else:
# empty line
numblank = numblank + 1
x = x + 1
while x< len(self._dataLines) and p_ws.search(self._dataLines[x]) is not None:
x = x + 1
if x == len(self._dataLines):
# Blanks at end doc: dont count
numblank = numblank-1
x = x - 1
x = x + 1
# Now from data of num blank lines & num text lines, if numline>3, & numblank=numline or numblank=numline-1
# then we hav blank line separators between ref lines
if (numline > 3) and ((numblank == numline) or (numblank == numline - 1)) and (multi_nonblanks_fd):
blankLnSep = 1
return blankLnSep
def _rebuild(self, refNum):
"""Based on whether a reference line numeration pattern was found, either have the reference lines rebuild by the
identification of marker characters, or join all lines together if no numeration was found.
refNum is the regex pattern (str or unicode) matching the marker at the start of a reference line; when it is not a
string, blank lines are used as separators if _testBlankLineRefSeparators() allows it, otherwise every line is
joined into a single reference line. Returns the list of rebuilt (and cleaned) reference lines
"""
# Private internal function
def cleanAndAppendToRefsList(transformers, refList, line):
"""Before appending to list, process line with 'TextLineTransformers'. A line that is empty or all whitespace after processing is not appended"""
for x in transformers:
line = x.processLine(line)
sp = re.compile(unicode(r'^\s*$'),re.UNICODE)
if sp.match(line) is None:
refList.append(line)
rebuilt = []
lineTrans = []
# tl: buffer holding the reference line that is currently being put together
tl = u''
# List of line transformers to clean up line:
lineTrans.append(URLRepairer())
lineTrans.append(EscapeSequenceTransformer())
lineTrans.append(MultispaceTruncator())
if refNum is None or (type(refNum) is not str and type(refNum) is not unicode):
if self._testBlankLineRefSeparators():
# Use blank lines to separate ref lines
refNum = unicode(r'^\s*$')
else:
# No ref line dividers: unmatchable pattern
refNum = unicode(r'^A$^A$$')
p_refNum = re.compile(refNum,re.I|re.UNICODE)
p_leadingws = re.compile(unicode(r'^\s+'))
p_trailingws = re.compile(unicode(r'\s+$'))
# The lines are walked from last to first: text is put in front of the buffer until a line that
# starts a reference is met, which completes the buffered reference line
for x in range(len(self._dataLines)-1,-1,-1):
tstr = p_leadingws.sub(u'',self._dataLines[x])
tstr = p_trailingws.sub(u'',tstr)
m = p_refNum.match(tstr)
if m is not None:
# Ref line start marker
if tstr == '':
# Blank line to separate refs
tl = p_trailingws.sub(u'',tl)
cleanAndAppendToRefsList(lineTrans, rebuilt, tl)
tl = u''
else:
# No joining space is added after a line that ends with a hyphen
if tstr[len(tstr)-1] == u'-' or tstr[len(tstr)-1] == u' ':
tl = tstr + tl
else:
tl = tstr + u' ' + tl
tl = p_trailingws.sub(u'',tl)
cleanAndAppendToRefsList(lineTrans, rebuilt, tl)
tl = u''
else:
if tstr != u'':
# Continuation of line
if tstr[len(tstr) - 1] == u'-' or tstr[len(tstr) - 1] == u' ':
tl = tstr + tl
else:
tl = tstr + u' ' + tl
if tl != u'':
# Append last line
tl = p_trailingws.sub(u'',tl)
cleanAndAppendToRefsList(lineTrans, rebuilt, tl)
# The lines were collected last-to-first: put them back into document order
rebuilt.reverse()
# Let the (experimental) marker-number check replace the list, if it returns a list at all
d=self._testAndCorrectRebuiltLines(rebuilt, p_refNum)
if d is not None: rebuilt = d
return rebuilt
def _testAndCorrectRebuiltLines(self, rebuiltlines, p_refmarker):
"""EXPERIMENTAL METHOD. Try to correct any rebuild reference lines that have been given a bad reference number at the start. Needs testing."""
fixed = []
unsafe = False
try:
m = p_refmarker.match(rebuiltlines[0])
last_marknum = int(m.group("marknum"))
if last_marknum != 1:
return None # Even the first mark isnt 1 - probaby too dangerous to try to repair
except IndexError:
return None # Either no references or not a "numbered line marker" - cannot do anything
except AttributeError:
return None # No reference line marker (i.e. NoneType because couldn't match marker) - cannot do anything
fixed.append(rebuiltlines[0])
try:
for x in range(1,len(rebuiltlines)):
m = p_refmarker.match(rebuiltlines[x])
try:
if int(m.group("marknum")) == last_marknum + 1:
# All is well
fixed.append(rebuiltlines[x])
last_marknum += 1
continue
elif len(string.strip(rebuiltlines[x][m.end():])) == 0:
# this line consists of a number only. And it is not a coorrect marker. Add it to the last line:
fixed[len(fixed) - 1] += rebuiltlines[x]
continue
else:
# Problem maybe. May have taken some of the last line into this line. Can we find the next marker in this line?
m_fix = p_refmarker.search(rebuiltlines[x])
if m_fix is not None and int(m_fix.group("marknum")) == last_marknum + 1:
m_fix_test = re.match(u"%s\s*[A-Z]"%(m_fix.group(),))
if m_fix_test is not None:
movesect = rebuiltlines[x][0:m_fix.start()]
rebuiltlines[x] = rebuiltlines[x][m_fix.start():]
fixed[len(fixed) - 1] += movesect
fixed.append(rebuiltlines[x])
else:
unsafe = True
break
else:
unsafe = True
break
except AttributeError:
# This line does not have a line marker at the start! This line shall be added to the end of the previous line.
fixed[len(fixed) - 1] += rebuiltlines[x]
continue
except IndexError:
unsafe = True
if unsafe: return None
else: return fixed
def _getLineNumerationStyle(self, refStartInfo):
"""Try to determine the numeration marker style for the reference lines"""
mkregex = None
if refStartInfo.markerPatternPresent():
mkregex = refStartInfo.getMarkerPattern()
return mkregex
def _removeLeadingGarbageLines(self):
"""Sometimes, the first lines of the extracted references are completely blank or email addresses. These must be removed as they are not references"""
p_emptyline = re.compile(unicode(r'^\s*$'),re.UNICODE)
p_email = re.compile(unicode(r'^\s*e\-?mail'),re.UNICODE)
while (len(self._dataLines)>0) and (p_emptyline.match(self._dataLines[0]) is not None or p_email.match(self._dataLines[0]) is not None):
self._dataLines[0:1] = []
class DocumentConverter:
    """Abstract base class for tools which convert a document from one
       format to another. Both methods are placeholders that do nothing
       and are meant to be overridden
    """
    def convertDocument(self, toConvert):
        """Convert the document 'toConvert'. Concrete converters return a
           document object; this placeholder returns None
        """
        return None
    def checkConvertFile(self, filePath):
        """Check that the file at 'filePath' is usable for conversion.
           This placeholder performs no check and returns None
        """
        return None
class OSDependentDocumentConverter(DocumentConverter):
    """ABSTRACT CLASS: Represents a document conversion tool which is a
       separate program which needs to be executed via a call to the shell.
       The 'session' is the pipe from which the program's output is read;
       while no session is open the session link is an empty string
    """
    def __init__(self):
        self._converterSessionLink = ''
        self._convertCommand = ''
    def setConvertCommand(self, filePath):
        """ABSTRACT METHOD: Set the shell command used for calling the
           converter application. Declared abstract because it differs
           according to which specific application is used.
           NOTE: the command is run by the shell (see openConverterSession), so
           implementations are responsible for quoting 'filePath'
        """
        pass
    def getConvertCommand(self):
        """Return the shell command by which the conversion application is called"""
        return self._convertCommand
    def _sessionIsOpen(self):
        """Return True if the session link currently holds a pipe object.
           (The link is a plain string when no session is open. The former test,
           'link is file', compared an instance with a type and was never true,
           so pipes were never closed)
        """
        return hasattr(self._converterSessionLink, 'close')
    def openConverterSession(self):
        """Open a session with the shell 'converter' application, closing any
           session which is still open
        """
        self.closeConverterSession()
        self._converterSessionLink = os.popen(self.getConvertCommand(),'r')
    def closeConverterSession(self):
        """Close session with the shell 'converter' application. Does nothing if no session is open"""
        if self._sessionIsOpen():
            try:
                self._converterSessionLink.close()
            finally:
                # Forget the pipe even if closing it failed
                self._converterSessionLink = ""
    def getConversionResult(self):
        """Return list of lines from shell conversion session"""
        return self._converterSessionLink.readlines()
class PDFtoTextDocumentConverter(OSDependentDocumentConverter):
    """Converts PDF documents to plaintext documents by calling the
       'pdftotext' application through the shell. The output of the
       application is read as UTF-8
    """
    def __init__(self):
        """Initialise PDFtoTextDocumentConverter object"""
        OSDependentDocumentConverter.__init__(self)
        self._applicationPath = ''
        self.setApplicationPath(cfg_refextract_pdftotext)
    def setApplicationPath(self, newPath):
        """Set path to conversion application"""
        self._applicationPath = newPath
    def getApplicationPath(self):
        """Return the path to the conversion application"""
        return self._applicationPath
    def _quoteForShell(self, filePath):
        """Return 'filePath' enclosed in single quotes (embedded single quotes
           escaped) so that a POSIX shell passes it on as one literal argument
        """
        return "'" + filePath.replace("'", "'\\''") + "'"
    def setConvertCommand(self, filePath):
        """Set up the command by which to call pdftotext application"""
        # The path is quoted: the command is interpreted by the shell, and an
        # unquoted path containing spaces or shell metacharacters would break
        # (or hijack) the command line
        self._convertCommand = self.getApplicationPath() + ' -raw -q -enc UTF-8 ' + self._quoteForShell(filePath) + ' -'
    def getConversionResult(self):
        """Return the lines produced by the conversion session, decoded from UTF-8"""
        return [line.decode("utf-8") for line in self._converterSessionLink]
    def convertDocument(self, toConvert):
        """Perform a conversion from PDF to text, returning the document contents as a TextDocument object.
           An empty TextDocument is returned if the file cannot be read or if the conversion looks bad
        """
        if not self._canAccessConvertFile(toConvert):
            return TextDocument()
        self.setConvertCommand(toConvert)
        self.openConverterSession()
        try:
            convRes = self.getConversionResult()
        finally:
            # Close the session even if reading/decoding the output failed
            self.closeConverterSession()
        if self._conversionIsBad(convRes):
            # Bad conversion: empty document
            return TextDocument()
        # Good conversion
        return TextDocument(convRes)
    def _conversionIsBad(self, convertedLines):
        """Sometimes pdftotext performs a bad conversion which consists of many spaces and garbage characters.
           This method takes a list of strings obtained from a pdftotext conversion and examines them to see if
           they are likely to be the result of a bad conversion: there are at least 3 times as many whitespace
           characters as 'words'. Returns True if bad conversion, False if not. (An empty list counts as bad)
        """
        # Numbers of 'words' and 'whitespaces' found in document:
        numWords = numSpaces = 0
        # whitespace character pattern:
        p_space = re.compile(unicode(r'(\s)'),re.UNICODE)
        # non-whitespace 'word' pattern:
        p_noSpace = re.compile(unicode(r'(\S+)'),re.UNICODE)
        for line in convertedLines:
            numWords += len(p_noSpace.findall(line))
            numSpaces += len(p_space.findall(line))
        # Too many spaces - probably bad conversion
        return numSpaces >= (numWords * 3)
    def _canAccessConvertFile(self, filePath):
        """Check that the path to the file to convert really exists and is readable by the shell"""
        return os.access(filePath, os.R_OK)
class PS2AsciiDocumentConverter(OSDependentDocumentConverter):
    """Converts PS documents to ASCII plaintext documents by piping the
       (possibly gzipped) file through GhostScript's 'ps2ascii'
    """
    # GhostScript arguments which run ps2ascii on standard input:
    _GS_PS2ASCII_ARGS = " -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -c save -f ps2ascii.ps - -c quit"
    def __init__(self):
        """Initialise PS2AsciiDocumentConverter object"""
        OSDependentDocumentConverter.__init__(self)
        self._catAppPath = self._gunzipAppPath = self._gsAppPath = ''
        self.setCATapplicationPath(cfg_refextract_cat)
        self.setGUNZIPapplicationPath(cfg_refextract_gunzip)
        self.setGSapplicationPath(cfg_refextract_gs)
    def setCATapplicationPath(self, catAppPath):
        """Set the path to the 'cat' application, used in conversion"""
        self._catAppPath = catAppPath
    def setGUNZIPapplicationPath(self, gunzipAppPath):
        """Set the path to the 'gunzip' application, used in conversion if the PS file has been zipped"""
        self._gunzipAppPath = gunzipAppPath
    def setGSapplicationPath(self, gsAppPath):
        """Set the path to the 'GhostScript' application, which is the means of calling 'ps2ascii'"""
        self._gsAppPath = gsAppPath
    def getCATapplicationPath(self):
        """Return the path to 'cat' as a string"""
        return self._catAppPath
    def getGUNZIPapplicationPath(self):
        """Return the path to 'gunzip' as a string"""
        return self._gunzipAppPath
    def getGSapplicationPath(self):
        """Return the path to 'gs' as a string"""
        return self._gsAppPath
    def _quoteForShell(self, filePath):
        """Return 'filePath' enclosed in single quotes (embedded single quotes
           escaped) so that a POSIX shell passes it on as one literal argument
        """
        return "'" + filePath.replace("'", "'\\''") + "'"
    def setUnzippedPSConvertCommand(self, filePath):
        """Set converter command for unzipped PS file conversion"""
        self._convertCommand = self.getCATapplicationPath() + " " + self._quoteForShell(filePath) + " | " + self.getGSapplicationPath() + self._GS_PS2ASCII_ARGS
    def setZippedPSConvertCommand(self, filePath):
        """Set converter command for zipped PS file conversion"""
        self._convertCommand = self.getGUNZIPapplicationPath() + " -c " + self._quoteForShell(filePath) + " | " + self.getGSapplicationPath() + self._GS_PS2ASCII_ARGS
    def setConvertCommand(self, filePath):
        """Set up the shell command by which to call applications needed to perform the conversion.
           A path ending in "ps" is treated as an unzipped PS file, anything else as a zipped one
        """
        # endswith() instead of a regex search: the search returned None (and
        # crashed on .group) for paths not ending in two word characters
        if filePath.endswith("ps"):
            self.setUnzippedPSConvertCommand(filePath)
        else:
            self.setZippedPSConvertCommand(filePath)
    def _canAccessConvertFile(self, filePath):
        """Check that the path to the file to convert really exists and is readable by the shell"""
        return os.access(filePath, os.R_OK)
    def _correctConvertFileName(self, filename):
        """Strip file extension from filename & replace with '.ps' if that file is readable,
           otherwise with '.ps.gz' (whether or not that file exists)
        """
        if filename.endswith('.ps.gz'):
            # Double extension: strip both parts, otherwise 'x.ps.gz' would be
            # turned into 'x.ps.ps' / 'x.ps.ps.gz'
            name = filename[:-len('.ps.gz')]
        else:
            name = re.search(r'(?P<fname>.*?)(\.\w+)?$', filename).group('fname')
        if self._canAccessConvertFile(name + '.ps'):
            return name + '.ps'
        return name + '.ps.gz'
    def convertDocument(self, toConvert):
        """This method performs a conversion from PS to text. If the file 'toConvert' exists and can be converted, a
           TextDocument object is returned. If not, then an empty TextDocument is returned"""
        toConvert = self._correctConvertFileName(toConvert)
        if not self._canAccessConvertFile(toConvert):
            return TextDocument()
        self.setConvertCommand(toConvert)
        self.openConverterSession()
        try:
            ps2asciiDoc = Ps2asciiEncodedTextDocument(self.getConversionResult())
            # Convert the ps2asciiDoc to plaintext:
            textDoc = ps2asciiDoc.convertToPlainText()
        finally:
            # Close the session even if the conversion failed
            self.closeConverterSession()
        return textDoc
class BadKBLineError(Exception):
    """Raised when a line of the periodicals knowledge base is not in the expected format"""
class KnowledgeBase:
    """The knowledge base of periodical titles. Consists of search & replace terms. The search terms consist of non-standard periodical titles in upper case.
       These are often found in the text of documents. Replacement terms consist of standardised periodical titles in a standardised case. These will be used to
       replace identified non-standard titles
    """
    def __init__(self, fn = None):
        """Create the knowledge base. If a file name 'fn' (str or unicode) is given, the knowledge base is read from that file"""
        # search term -> replacement term:
        self._kb = {}
        # compiled search pattern -> replacement term:
        self._compiledPatternsKB = {}
        # compiled search pattern -> search term:
        self._unstandardisedTitle = {}
        if isinstance(fn, (str, unicode)): self._buildKB(fn)
    def _buildKB(self, fn):
        """From the filename provided (fn), read the periodicals knowledge base into memory, and build a dictionary of seek/replace values to be stored in self._kb.
           Each line of the UTF-8 encoded file must look like "SEEK TERM---Replacement term". The program is terminated (sys.exit) if the file cannot
           be read or if a line is not in this format
        """
        def _mychop(line):
            """Remove one trailing newline character from line"""
            if line[-1:] == u'\n':
                line = line[:-1]
            return line
        p_kbLine = re.compile(unicode(r'^\s*(?P<seek>\w.*?)\s*---\s*(?P<repl>\w.*?)\s*$'),re.UNICODE)
        try:
            # Read bytes and decode them explicitly
            fh = open(fn, 'rb')
            try:
                for x in fh:
                    y = _mychop(x.decode("utf-8"))
                    m_kbLine = p_kbLine.search(y)
                    if m_kbLine is None:
                        raise BadKBLineError()
                    seek = m_kbLine.group('seek')
                    repl = m_kbLine.group('repl')
                    if len(seek) > 1:
                        # Only add KB line if the search term is more than 1 char in length
                        self._kb[seek] = repl
                        # The term must be followed by a character which is not a capital letter or a digit:
                        tmp_ptn = re.compile(unicode(r'\b(') + re.escape(seek) + unicode(r')[^A-Z0-9]'), re.UNICODE)
                        self._compiledPatternsKB[tmp_ptn] = repl
                        self._unstandardisedTitle[tmp_ptn] = seek
            finally:
                # Never leave the file open, whatever happened while reading it
                fh.close()
        except IOError:
            sys.exit('E: Cannot Open Knowledge Base File "%s".' % fn)
        except (BadKBLineError, AttributeError, UnicodeError):
            # UnicodeError: the line is not valid UTF-8
            sys.exit('E: Unexpected Line in Knowledge Base "%s".' % fn)
    def display(self):
        """Display the contents of the KB on the standard output stream"""
        print(u"Knowledge Base Contents:")
        for x in self._kb:
            sys.stdout.write("Search Term: '%s';\t\tReplace Term: '%s'\n" % (x.encode("utf-8"), (self._kb[x]).encode("utf-8")))
    def findPeriodicalTitles(self, ln):
        """Identify periodical titles in text line 'ln' and record information about where in the line they occur. Replace them with lower-case versions, or
           with underscores if the matched text contains characters other than letters, spaces and hyphens.
           Return a tuple (match lengths keyed by start position, matched search terms keyed by start position, the new line, flag telling whether
           anything was found)
        """
        title_match_len = {}
        title_match_txt = {}
        # Longest search terms first, so that a long title is consumed before a
        # shorter title which it contains gets the chance to match:
        kb_keys = sorted(self._compiledPatternsKB, key=lambda ptn: len(self._unstandardisedTitle[ptn]), reverse=True)
        word_ptn = re.compile(unicode(r'^[ A-Z-a-z]+$'),re.UNICODE)
        for t_ptn in kb_keys:
            # Every replacement below keeps the length of the line unchanged, so the
            # positions of the matches found in the line as it was when the search
            # started remain valid while the line is being rewritten
            for m in t_ptn.finditer(ln):
                # Record match info (length without the 1 char following the title)
                title_match_len[m.start()] = (len(m.group(0)) - 1)
                title_match_txt[m.start()] = self._unstandardisedTitle[t_ptn]
                # Replace matched txt in line with lowercase version (or n*'_' where n is len of match)
                rep_str = m.group(1)
                if word_ptn.search(rep_str) is None:
                    # Not only alpha/whitespace chars
                    rep_str = u'_'*len(rep_str)
                else:
                    # Words
                    rep_str = rep_str.lower()
                ln = u''.join([ln[0:m.start(1)],rep_str,ln[m.end(1):]])
        foundMatch = len(title_match_len) > 0
        return (title_match_len, title_match_txt, ln, foundMatch)
    def __getitem__(self, non_std_title):
        """Return the standardised title thought to be keyed by 'non_std_title'. Return None if not there"""
        return self._kb.get(non_std_title)
class PreprintClassificationItem(object):
    """A preprint category: the search string by which it is recognised in a
       line and the string which replaces it. Accessed through the properties
       's_str' (search string), 'r_str' (replacement string) and 'length'
       (read-only length of the search string).
       This is a new-style class on purpose: on an old-style class the property
       setters would be bypassed and an assignment to 's_str'/'r_str' would
       merely create an instance attribute, leaving 'length' out of date
    """
    def __init__(self, srch = '', repl = ''):
        self._srchStr, self._rpStr = srch, repl
    def setSearchString(self, sstr): self._srchStr = sstr
    def setReplString(self, repstr): self._rpStr = repstr
    def getSearchString(self): return self._srchStr
    def getReplString(self): return self._rpStr
    def getLength(self): return len(self._srchStr)
    r_str = property(fget = getReplString, fset = setReplString)
    s_str = property(fget = getSearchString, fset = setSearchString)
    length = property(fget = getLength)
    # The accessors are reachable through the properties only:
    del setSearchString, setReplString, getSearchString, getReplString
    del getLength
class Institute(object):
    """An institute with the preprint categories and the numeration styles
       by which its preprint report numbers are recognised in a line of text.
       Categories are objects offering 's_str' (search string), 'r_str'
       (replacement string) and 'length'; numeration styles are user-defined
       pattern strings (see _createPattern).
       New-style class, so that the setter of the 'name' property is honoured
    """
    def __init__(self, nm):
        self._name = nm
        self._preprintCatsList = []
        self._numerationList = []
        # All numeration styles as one alternation regex (see assignNumerationRegex):
        self._numerationRegex = ""
        # One regex per numeration style (see assignNumerationRegexList):
        self._numerationRegexList = []
        # category -> list of compiled patterns (see createTestPatternsList):
        self._preprintCatPatternsList = {}
    def setName(self, nm): self._name = nm
    def getName(self): return self._name
    def display(self):
        """Display the institute's details on the standard output stream, UTF-8 encoded"""
        # Everything is assembled as unicode and encoded once per line: adding
        # an encoded name to a unicode literal made Python decode the name as
        # ASCII again, which failed for names with non-ASCII characters
        lines = [u"----------------------", u"Name: " + self._name, u"Preprint Categories:"]
        for x in self._preprintCatsList:
            lines.append(u"Search: %s Replace With: %s" % (x.s_str, x.r_str))
        lines.append(u"Numeration Styles List:")
        lines.extend(self._numerationList)
        lines.append(u"Numeration Styles Regular expression List:")
        lines.append(self._numerationRegex)
        lines.append(u"----------------------")
        for line in lines:
            print(line.encode("utf-8"))
    def _getPatternLenList(self):
        """Return a list of (length, pattern) tuples, one per numeration style of this institute.
           For the calculation of the length each user-defined character class "[...]" counts as 1 character
        """
        ccp = re.compile(unicode(r'\[[^\]]+\]'),re.UNICODE)
        return [(len(ccp.sub(u'1', x)), x) for x in self._numerationList]
    def _getSortedPatternLenList(self):
        """Return the (length, pattern) tuples of the numeration styles, longest first"""
        lenPtns = self._getPatternLenList()
        lenPtns.sort(key=lambda itm: itm[0], reverse=True)
        return lenPtns
    def _sortCategories(self):
        """Sort the preprint categories in place, longest search string first"""
        self._preprintCatsList.sort(key=lambda categ: categ.length, reverse=True)
    def _createPattern(self, ptn):
        r"""Accept a user-defined search pattern, transform it, according to some simple rules, into a regex pattern, and return it as a string.
            The rules are applied one after the other, in this order:
                any character but letters, digits, '/', '[', ']', ' ' and '"'  ->  the same character preceded by '\'
                9             ->  \d
                a             ->  [A-Za-z]
                mm            ->  (0[1-9]|1[0-2])
                yyyy          ->  [12]\d\d\d
                yy            ->  \d\d
                /             ->  \/
                "text"        ->  text
                ' [chars ]'   ->  ( [chars])?
        """
        # (search regex, replacement template) pairs. A backslash which has to
        # appear in the result is written as an escaped backslash in the
        # template, so that no template relies on unknown escapes being kept
        s_r = (
            (unicode(r'([^\]A-Za-z0-9\/\[ "])'), unicode(r'\\\g<1>')),
            (u'9', unicode(r'\\d')),
            (u'a', unicode(r'[A-Za-z]')),
            (u'mm', unicode(r'(0[1-9]|1[0-2])')),
            (u'yyyy', unicode(r'[12]\\d\\d\\d')),
            (u'yy', unicode(r'\\d\\d')),
            (unicode(r'\/'), unicode(r'\\/')),
            (unicode(r'\"([^"]+)\"'), unicode(r'\g<1>')),
            (unicode(r' \[([^\]]+) \]'), unicode(r'( [\g<1>])?')),
        )
        for (srch, repl) in s_r:
            ptn = re.compile(srch, re.UNICODE).sub(repl, ptn)
        return ptn
    def _makeOrderedPtns(self, ptns):
        """From the list of (length, pattern) tuples 'ptns', produce one regex string in which the transformed
           patterns are the alternatives of a group named 'numn'. Return an empty string for an empty list
        """
        if len(ptns) == 0:
            return u""
        return u"(?P<numn>" + u"|".join([self._createPattern(i[1]) for i in ptns]) + u")"
    def assignNumerationRegex(self):
        """Build the single regex pattern covering all of this institute's numeration styles"""
        self._numerationRegex = self._makeOrderedPtns(self._getSortedPatternLenList())
    def _makeOrderedPtnsList(self, ptns):
        """From the list of (length, pattern) tuples 'ptns', produce a list with one regex string per pattern,
           each being a group named 'numn'
        """
        return [u"(?P<numn>" + self._createPattern(p[1]) + u")" for p in ptns]
    def assignNumerationRegexList(self):
        """Build the list of regex patterns, one for each of this institute's numeration styles"""
        self._numerationRegexList = self._makeOrderedPtnsList(self._getSortedPatternLenList())
    def createTestPatternsList(self):
        """Compile, for every preprint category, one pattern per numeration style
           (category search string followed by the numeration). Used by matchCategs2
        """
        self.assignNumerationRegexList()
        self._sortCategories()
        preprintCatPatternsList = {}
        for categ in self._preprintCatsList:
            categptnslist = []
            for num_ptn in self._numerationRegexList:
                categptnslist.append(re.compile(unicode(r'\b((?P<categ>') + categ.s_str + u')' + num_ptn + r')',re.UNICODE))
            preprintCatPatternsList[categ] = categptnslist
        self._preprintCatPatternsList = preprintCatPatternsList
    def _recordCategMatches(self, ptn, categ, ln, inst_full_len, inst_RN_rep_str):
        """Record in the two dictionaries (keyed by start position) every match of the compiled pattern 'ptn' in
           line 'ln' and lower-case the matched category in the line. Return the new line
        """
        # Lower-casing keeps the length of the line, so the positions of the
        # matches found in the line as passed in stay valid during the rewrite
        for x in ptn.finditer(ln):
            # Get hyphenated numeration segment of category:
            numnMatch = re.sub(r'\s', '-', x.group('numn'))
            # Replace found categ in string with lowercase version:
            ln = ln[0:x.start()] + x.group('categ').lower() + ln[x.end('categ'):]
            inst_full_len[x.start()] = len(x.group(0))
            inst_RN_rep_str[x.start()] = categ.r_str + numnMatch
        return ln
    def matchCategs2(self, ln):
        """Accept a line. Try to find matches for each of the preprint categories of this institute within that line,
           using the patterns prepared by createTestPatternsList.
           Return a tuple (match lengths keyed by start position, standardised report numbers keyed by start position, the new line)
        """
        inst_full_len = {}
        inst_RN_rep_str = {}
        self._sortCategories()
        for categ in self._preprintCatsList:
            for ptn in self._preprintCatPatternsList[categ]:
                ln = self._recordCategMatches(ptn, categ, ln, inst_full_len, inst_RN_rep_str)
        return (inst_full_len, inst_RN_rep_str, ln)
    def matchCategs(self, ln):
        """Accept a line. Try to find matches for each of the preprint categories of this institute within that line,
           using the single numeration regex built by assignNumerationRegex.
           Return a tuple (match lengths keyed by start position, standardised report numbers keyed by start position, the new line)
        """
        inst_full_len = {}
        inst_RN_rep_str = {}
        self._sortCategories()
        for categ in self._preprintCatsList:
            # Make the regex for this categ and search the line with it:
            my_ptn = re.compile(unicode(r'\b((?P<categ>') + categ.s_str + u')' + self._numerationRegex + r')',re.UNICODE)
            ln = self._recordCategMatches(my_ptn, categ, ln, inst_full_len, inst_RN_rep_str)
        return (inst_full_len, inst_RN_rep_str, ln)
    def addCategory(self, k, v): self._preprintCatsList.append(PreprintClassificationItem(k,v))
    def addNumerationStyle(self, num): self._numerationList.append(num)
    name = property(fget = getName, fset = setName)
    del setName, getName
class InstituteList:
    """The list of institutes (with their preprint categories and numeration
       styles) read from the institutes file, used to identify references to
       institutes' preprints in lines of text
    """
    def __init__(self, fn = ''):
        """Read the institutes from the file 'fn' and prepare their search patterns"""
        self._iList = self._getInstituteList(fn)
        self._buildInstNumtnRegexs()
    def _buildInstNumtnRegexs(self):
        """Have every institute compile its search patterns"""
        for i in self._iList: i.createTestPatternsList()
    def display(self):
        """Display every institute on the standard output stream"""
        for x in self._iList: x.display()
    def _getInstituteList(self, fn):
        """Read the list of institutes in from the file and return an institute list. Terminates execution if cant read the file.
           The UTF-8 encoded file consists of institute headers ("##### name #####"), each followed by the
           institute's categories ("search---replace") and numeration styles ("<style>"). Other lines are ignored
        """
        p_instName = re.compile(unicode(r'^\#{5}\s*(.+)\s*\#{5}$'),re.UNICODE)
        p_prepClass = re.compile(unicode(r'^\s*(\w.*?)\s*---\s*(\w.*?)\s*$'),re.UNICODE)
        p_numtn = re.compile(unicode(r'^\<(.+)\>$'),re.UNICODE)
        iList = []
        # Institute to which categories & numeration styles are currently added:
        curInst = None
        try:
            # Read bytes and decode them explicitly
            fh = open(fn, 'rb')
            try:
                for x in fh:
                    y = x.decode("utf-8").rstrip(u"\r\n")
                    m_instName = p_instName.search(y)
                    if m_instName is not None:
                        curInst = Institute(m_instName.group(1))
                        iList.append(curInst)
                        continue
                    if curInst is None:
                        # Category or numeration style before the first institute
                        # header: there is nothing to attach it to, skip the line.
                        # (The former "except AttributeError, NameError" caught
                        # AttributeError only, so such a line ended the program
                        # with an UnboundLocalError)
                        continue
                    m_prepClass = p_prepClass.search(y)
                    if m_prepClass is not None:
                        curInst.addCategory(m_prepClass.group(1), m_prepClass.group(2))
                        continue
                    m_numtn = p_numtn.search(y)
                    if m_numtn is not None:
                        curInst.addNumerationStyle(m_numtn.group(1))
            finally:
                # Never leave the file open, whatever happened while reading it
                fh.close()
        except IOError:
            sys.exit('E: Cannot Open Institutes File "%s".' % fn)
        return iList
    def identifyPreprintReferences(self, ln):
        """Accept a line of text (String) and search it against the institutes records held in order to identify references to an institutes preprints.
           Return a tuple (match lengths keyed by start position, standardised report numbers keyed by start position, the line with the
           identified categories in lower case, flag telling whether anything was found)
        """
        identified_pp_len = {}
        identified_pp_repStr = {}
        for inst in self._iList:
            (tmp_id_lens, tmp_id_repStrs, ln) = inst.matchCategs2(ln)
            identified_pp_len.update(tmp_id_lens)
            identified_pp_repStr.update(tmp_id_repStrs)
        foundMatch = len(identified_pp_len) > 0
        return (identified_pp_len, identified_pp_repStr, ln, foundMatch)
class LineIBIDidentifier:
    """Class to identify and record information about IBID ocurrences in a text line"""
    def __init__(self):
        """Initialise regex pattern used to identify an IBID item"""
        # IBID (optionally with a series: letter A-H, roman or arabic number) followed by a colon:
        self._p_ibid = re.compile(unicode(r'(-|\b)(IBID\.?( ([A-H]|(I{1,3}V?|VI{0,3})|[1-3]))?)\s?:'),re.UNICODE)
        # IBID anywhere in the line, the optional series being group 1:
        self._pIbidPresent = re.compile(unicode(r'IBID\.?\s?([A-H]|(I{1,3}V?|VI{0,3})|[1-3])?'),re.UNICODE)
    def lineHasIbid(self, ln):
        """Return True if line 'ln' contains an IBID, False if not"""
        return self._pIbidPresent.search(ln) is not None
    def getIbidSeriesLetter(self, ln):
        """Return the series (letter or number) of the first IBID in line 'ln'.
           Return an empty string if the IBID has no series or if the line has no IBID at all
        """
        m_ibid = self._pIbidPresent.search(ln)
        if m_ibid is None:
            # No IBID in the line (m_ibid.group() would raise an AttributeError)
            return u""
        series_letter = m_ibid.group(1)
        if series_letter is None: series_letter = u""
        return series_letter
    def identify_record_ibids(self, ln):
        """Identify the IBIDs in "line". Record their information (index position in line, match length, and matched text. When identified, the word IBID
           is replaced with a lower-case version of itself Finally, the line is returned with all IBIDs identified, along with a lists of the identified
           IBID text and length. These 3 items are returned in a tuple.
        """
        ibid_match_len = {}
        ibid_match_txt = {}
        # Lower-casing keeps the length of the line, so the positions of the
        # matches found in the line as passed in stay valid during the rewrite
        for m in self._p_ibid.finditer(ln):
            ibid_txt = m.group(2)
            # Record match info
            ibid_match_len[m.start()] = len(ibid_txt)
            ibid_match_txt[m.start()] = ibid_txt
            # Replace matched txt in line with lowercase version
            ln = ln[0:m.start(2)] + ibid_txt.lower() + ln[m.end(2):]
        return (ibid_match_len, ibid_match_txt, ln)
class URLidentifier:
    """Finds the URLs of a line, records where they are and what they are,
       and blanks them out of the line
    """
    def __init__(self):
        """Compile the patterns for raw URLs and for URLs inside HTML anchor tags"""
        self._urlstr = unicode(r'((https?|s?ftp):\/\/([\w\d\_\.\-])+(\/([\w\d\_\.\-])+)*(\/([\w\d\_\-]+\.\w{1,6})?)?)')
        self._p_rawURL = re.compile(self._urlstr,re.UNICODE|re.I)
        self._p_taggedURL = re.compile(unicode(r'(\<a\s+href\s*=\s*([\'"])?(((https?|s?ftp):\/\/)?([\w\d\_\.\-])+(\/([\w\d\_\.\-])+)*(\/([\w\d\_\-]+\.\w{1,6})?)?)([\'"])?\>([^\<]+)\<\/a\>)'),re.UNICODE|re.I)
    def removeURLs(self, ln):
        """Replace every URL in line 'ln' by as many underscores as the URL was long.
           Return a tuple (match lengths, URLs, URL descriptions, flag telling whether any URL
           was found, the new line); the 3 dictionaries are keyed by the start position of the match
        """
        fullLens = {}
        urls = {}
        descriptions = {}
        # 1st pass - URLs in anchor tags: group 3 is the URL, group 12 the text of the anchor
        for tagged in self._p_taggedURL.finditer(ln):
            pos = tagged.start()
            width = len(tagged.group())
            fullLens[pos] = width
            urls[pos] = tagged.group(3)
            descriptions[pos] = tagged.group(12)
            ln = ln[:pos] + u"_"*width + ln[tagged.end():]
        # 2nd pass - raw URLs in what is left of the line; the URL is its own description
        for raw in self._p_rawURL.finditer(ln):
            pos = raw.start()
            width = len(raw.group())
            fullLens[pos] = width
            urls[pos] = raw.group(1)
            descriptions[pos] = raw.group(1)
            ln = ln[:pos] + u"_"*width + ln[raw.end():]
        # Every URL found left an entry in the dictionaries:
        urlfound = len(fullLens) > 0
        return (fullLens, urls, descriptions, urlfound, ln)
class ProcessedReferenceLineBuilder:
"""Create a "ProcessedReferenceLine" from a reference line and information about where any matched items are"""
def __init__(self, titles_list, ibid_identifier, numeration_processor, line_cleaner):
    """Keep the collaborators handed in by the caller and prepare the
       patterns needed to take a marked-up reference line apart
    """
    # Opening tag of any citation markup item; group 2 is the tag type:
    self._p_tagFinder = re.compile(unicode(r'(\<cds\.(TITLE|VOL|YR|PG|RN|SER|URI value="[^\>]+")\>)'),re.UNICODE)
    # Punctuation and whitespace at the start of a piece of text:
    self._p_leadRubbishRemover = re.compile(unicode(r'^([\.,;:-]+|\s+)+'),re.UNICODE)
    # Tagged volume (group 2), year (group 3) and page (group 4) at the start of a piece of text:
    self._p_getNumatn = re.compile(unicode(r'^(\s*.?,?\s*:\s\<cds\.VOL\>(\d+)\<\/cds\.VOL> \<cds\.YR\>\(([1-2]\d\d\d)\)\<\/cds\.YR\> \<cds\.PG\>([RL]?\d+[c]?)\<\/cds\.PG\>)'),re.UNICODE)
    # Reference line markers and the tool which looks for them at the start of a line:
    self._p_lineMarker = RefLineNumerationListCompiler().getCompiledPatternList()
    self._searcher = LineSearcher()
    # Collaborators supplied by the caller:
    self._linecleaner = line_cleaner
    self._numerationprocessor = numeration_processor
    self._ibidIdentifier = ibid_identifier
    self._titleslist = titles_list
def _buildProcessedLine(self,ln,rawline):
    """Given a potentially marked up reference line, build and return a "ProcessedReferenceLine" object.
       'ln' is the line carrying the <cds.*> markup tags, 'rawline' is the line it was made from.
       The line is consumed from left to right, tag by tag: titles (with the numeration following them), report
       numbers and URLs become citation sections of the processed line, the text in front of them becoming their
       'misc' text; every other tagged item is merged into the misc text. If no citation at all was found and
       cfg_refextract_no_citation_treatment is 0, the processed line is rebuilt from 'rawline' instead
    """
    processedLine = ProcessedReferenceLine()
    # NOTE(review): 'linebckp' is not used anywhere in this method
    linebckp = ln
    ln = string.lstrip(ln)
    # Trim line marker from start of line if possible & add it as a line segment
    m_lineMarker = self._searcher.findAtStartLine(ln, self._p_lineMarker)
    if m_lineMarker is not None:
        processedLine.addSection(LineMarker(m_lineMarker.group(u'mark')))
        ln = ln[m_lineMarker.end():]
    else:
        # No marker found: a single space takes the place of the marker
        processedLine.addSection(LineMarker(u" "))
    m_tag = self._p_tagFinder.search(ln)
    # Miscellaneous text collected since the last citation section was added:
    thismisc = u""
    # NOTE(review): in every branch of the loop below the result of re.match() is
    # used without a test for None - a tag whose content does not have the
    # expected form would raise an AttributeError
    while m_tag is not None:
        # Found citation markup tag in line
        tagtype = m_tag.group(2)
        if tagtype == u"TITLE":
            # Title section
            thisyr = thispg = thisvol = None
            # Get text up to point of this match:
            if len(self._p_leadRubbishRemover.sub(u"",ln[0:m_tag.start()])) > 0: thismisc += ln[0:m_tag.start()]
            # Match the text in front of the tag plus the complete tagged item; group 3 is the tag's content
            m_titletxt = re.match(unicode(r'^(%s(\<cds\.TITLE\>([^\<]+)\<\/cds\.TITLE\>))'%(re.escape(ln[0:m_tag.start()]),)),ln,re.UNICODE)
            thistitle = m_titletxt.group(3)
            ln = ln[m_titletxt.end():]
            # Remove and add volume, year and pagination tags which follow title if present
            m_numatn = self._p_getNumatn.match(ln)
            if m_numatn is not None:
                thisvol = m_numatn.group(2)
                thisyr = m_numatn.group(3)
                thispg = m_numatn.group(4)
                ln = ln[m_numatn.end():]
                if len(thismisc) == 0: thismisc = None
                processedLine.addSection(TitleCitationStandard(thistitle, thismisc, thispg, thisvol, thisyr))
                thismisc = u""
            else:
                # Title without numeration: no citation, the title is kept as misc text
                thismisc += u" " + thistitle
        elif tagtype == u"RN":
            # Preprint reference number section
            # Get misc text up to point of match
            if len(self._p_leadRubbishRemover.sub(u"",ln[0:m_tag.start()])) > 0: thismisc += ln[0:m_tag.start()]
            m_rntxt = re.match(unicode(r'^(%s(\<cds\.RN\>([^\<]+)\<\/cds\.RN\>))'%(re.escape(ln[0:m_tag.start()]),)),ln,re.UNICODE)
            thisrn = m_rntxt.group(3)
            ln = ln[m_rntxt.end():]
            if len(thismisc) == 0: thismisc = None
            processedLine.addSection(InstitutePreprintReferenceCitation(thisrn, thismisc))
            thismisc = u""
        elif string.find(tagtype ,u"URI") == 0:
            # URL found
            # Get misc text up to point of match
            if len(self._p_leadRubbishRemover.sub(u"",ln[0:m_tag.start()])) > 0: thismisc += ln[0:m_tag.start()]
            # Group 3 is the URL (the tag's "value"), group 4 the description (the tag's content)
            m_urlinfo = re.match(unicode(r'^(%s(\<cds\.URI value\=\"([^\>]+)\"\>([^\<]+)\<\/cds\.URI\>))'%(re.escape(ln[0:m_tag.start()]),)),ln,re.UNICODE)
            thisurl = m_urlinfo.group(3)
            thisurldescr = m_urlinfo.group(4)
            # NOTE(review): group 4 is matched by "[^\<]+", so it cannot be empty when there is a match
            if len(thisurldescr) == 0: thisurldescr = thisurl
            if len(thismisc) == 0: thismisc = None
            processedLine.addSection(URLCitation(thisurl, thisurldescr, thismisc))
            thismisc = u""
            ln = ln[m_urlinfo.end():]
        elif tagtype == u"VOL":
            # Volume info - it wasnt found after a title, so treat as misc
            thismisc += ln[0:m_tag.start()]
            m_voltxt = re.match(unicode(r'^(%s(\<cds\.VOL\>(\d+)\<\/cds\.VOL\>))'%(re.escape(ln[0:m_tag.start()]),)),ln,re.UNICODE)
            thismisc += m_voltxt.group(3)
            ln = ln[m_voltxt.end():]
        elif tagtype == u"YR":
            # Year info - not found after title info, so its text (the tags removed) is kept as misc
            thismisc += ln[0:m_tag.start()]
            m_yrtxt = re.match(unicode(r'^(%s(\<cds\.YR\>(\([1-2]\d\d\d\))\<\/cds\.YR\>))'%(re.escape(ln[0:m_tag.start()]),)),ln,re.UNICODE)
            thismisc += m_yrtxt.group(3)
            ln = ln[m_yrtxt.end():]
        elif tagtype == u"PG":
            # Pagination info - not found after title info, so its text (the tags removed) is kept as misc
            thismisc += ln[0:m_tag.start()]
            m_pgtxt = re.match(unicode(r'^(%s(\<cds\.PG\>([RL]?\d+[c]?)\<\/cds\.PG\>))'%(re.escape(ln[0:m_tag.start()]),)),ln,re.UNICODE)
            thismisc += m_pgtxt.group(3)
            ln = ln[m_pgtxt.end():]
        elif tagtype == u"SER":
            # Series info - not after title info (should have been caught earlier in fact), so its text is kept as misc
            thismisc += ln[0:m_tag.start()]
            m_sertxt = re.match(unicode(r'^(%s(\<cds\.SER\>([A-H]|(I{1,3}V?|VI{0,3}))\<\/cds\.SER\>))'%(re.escape(ln[0:m_tag.start()]),)),ln,re.UNICODE)
            thismisc += m_sertxt.group(3)
            ln = ln[m_sertxt.end():]
        else:
            # Unknown tag (should never happen) - its content is kept as misc
            thismisc += ln[0:m_tag.start()]
            m_uknowntag = re.match(unicode(r'^(%s(\<cds\.[^\>]+?\>([^\<]+?)\<\/cds\.[^\>]+?\>))'%(re.escape(ln[0:m_tag.start()]),)),ln,re.UNICODE)
            thismisc += m_uknowntag.group(3)
            ln = ln[m_uknowntag.end():]
        # Look for the next tag in what is left of the line
        m_tag = self._p_tagFinder.search(ln)
    if processedLine.getNumberCitations() == 0 and cfg_refextract_no_citation_treatment == 0:
        # No Citations were found and strict mode in use demanding that when no citations are found the entire ORIGINAL, UNTOUCHED line be marked up into misc
        processedLine = ProcessedReferenceLine()
        untouchedline = string.lstrip(rawline)
        m_lineMarker = self._searcher.findAtStartLine(untouchedline, self._p_lineMarker)
        if m_lineMarker is not None:
            processedLine.addSection(LineMarker(m_lineMarker.group(u'mark')))
            untouchedline = untouchedline[m_lineMarker.end():]
        else:
            processedLine.addSection(LineMarker(u" "))
        # Only add the text if something other than leading punctuation/whitespace is left:
        if len(self._p_leadRubbishRemover.sub(u"",untouchedline)) > 0:
            processedLine.addSection(LineMiscellaneousText(untouchedline))
    else:
        # Whatever is left after the last tag is misc text as well
        thismisc += ln
        if len(self._p_leadRubbishRemover.sub(u"",thismisc)) > 0:
            processedLine.addSection(LineMiscellaneousText(thismisc))
    return processedLine
def getProcessedReferenceLine(self, titlematch_len, titlematch_str, pprintmatch_str, pprintmatch_len, urlmatchfull_len, urlmatch_str, url_desc_str,\
removed_spaces, rawline, original_line, working_line, foundCitations):
"""Build the marked-up version of a reference line and return the result of self._buildProcessedLine(marked_line, rawline).

When foundCitations is false the marked-up line is simply original_line. Otherwise the line is
rebuilt from original_line: the replacement positions recorded in titlematch_str, pprintmatch_str
and urlmatch_str are visited in ascending order, each position is corrected for stripped spaces
(removed_spaces) and the matching helper (_addLineTitle, _replaceLineItemPreprintRef or _addLineURI)
appends the tagged item; finally the rest of original_line after the last item is appended.

working_line is accepted but never read in this body; series_letter and replacement_types are
assigned but never read either.

NOTE(review): the indentation of this file was lost. It cannot be told from here whether the
restandardise / removeSeriesTags / clean calls near the end also run when foundCitations is
false - confirm against the properly indented source.
"""
marked_line = u"" # line after titles etc have been recognised & marked up with "<cds.TITLE/>" etc tags
if not foundCitations:
marked_line = original_line
else:
# Rebuild line with citations marked up and standardised:
start_pos = 0 # First cell of the reference line...
last_match = u""
extras = 0 # Variable to count the extra spaces to add
series_letter = u""
replacement_types = {}
# Sorted copies of the position keys of every kind of recorded match:
url_keys = urlmatch_str.keys()
url_keys.sort()
title_keys = titlematch_str.keys()
title_keys.sort()
pp_keys = pprintmatch_str.keys()
pp_keys.sort()
spaces_keys = removed_spaces.keys()
spaces_keys.sort()
# First, adjust the index replacement values of the URI replacements as they were made before the multispaces etc were
# stripped & other replacements made after this could therefore have the same replacement indices
uri_virtual_locations = self._getVirtualUrlPositions(url_keys, spaces_keys, removed_spaces)
# Make dictionary containing the types of replacements to be made at each position:
rep_types = self._getReplacementTypes(uri_virtual_locations,title_keys,pp_keys)
rep_types_keys = rep_types.keys()
rep_types_keys.sort()
# Begin the rebuild:
for repidx in rep_types_keys:
# Both corrected indices start from the recorded position; extras is reset for every item
true_repidx = repidx
spare_repidx = repidx
extras = 0
# Account for any spaces stripped before these values:
(true_repidx,spare_repidx,extras) =\
self._addExtraStrippedSpaces(spaces_keys,removed_spaces,rep_types,pprintmatch_len,titlematch_len,true_repidx,spare_repidx,repidx,extras)
if rep_types[repidx] == u"TITLE":
# Process addition of text into line for title:
(marked_line,start_pos,last_match) = self._addLineTitle(titlematch_str,titlematch_len,original_line,marked_line,start_pos,repidx,true_repidx,extras,last_match)
elif rep_types[repidx] == u"RN":
# Process addition of text into line for preprint reference:
(marked_line,start_pos) = self._replaceLineItemPreprintRef(pprintmatch_str,pprintmatch_len,original_line,marked_line,start_pos,repidx,true_repidx,extras)
elif rep_types[repidx] == u"URI":
# Process addition of text into line for URL:
(marked_line,start_pos) = self._addLineURI(urlmatch_str,url_desc_str,urlmatchfull_len,uri_virtual_locations,original_line,marked_line,start_pos,repidx,true_repidx)
# Whatever follows the last replaced item is copied over unchanged:
marked_line = marked_line + original_line[start_pos:]
# Second numeration pass, series tags folded away, punctuation cleaned:
marked_line = self._numerationprocessor.restandardise(marked_line)
marked_line = self._numerationprocessor.removeSeriesTags(marked_line) # Remove any "Series tags"
marked_line = self._linecleaner.clean(marked_line)
return self._buildProcessedLine(marked_line,rawline)
def _replaceIbid(self,series_letter,last_match,rebuiltLine,ibid_str):
"""Replace an IBID occurrence in a line with the "last matched" title in the line. Also take into account a new series letter governed by the ibid"""
if series_letter != u"":
# IBID to replace has a series letter, so if the last matched title had a series letter, this must be changed to the new series letter
if string.find(last_match,",") != -1:
# Presence of comma signifies possible series information. Only replace if it is a single item (e.g. "A")
m_lastMatch = re.search(unicode(r'\, +([A-H]|(I{1,3}V?|VI{0,3}))$'),last_match,re.UNICODE)
if m_lastMatch is not None:
temp_series = m_lastMatch.group(1)
if temp_series == series_letter:
rebuiltLine = rebuiltLine + u" <cds.TITLE>" + last_match + u"</cds.TITLE>"
else:
last_match = re.sub(u"(\\.?)(,?) %s$"%(temp_series,),u"\\g<1>\\g<2> %s"%(series_letter,),last_match)
rebuiltLine = rebuiltLine + u" <cds.TITLE>" + last_match + u"</cds.TITLE>"
else:
# Series info of last match not letter or roman numeral: cannot be sure about meaning of IBID - dont replace it
rebuiltLine = rebuiltLine + ibid_str
else:
# Match had no series letter but IBID did. Add comma followed by IBID series letter to last match, then add it
last_match = string.rstrip(last_match)
if last_match[-1] == u".":
last_match = last_match + u", " + series_letter
else:
# Last match end with space - replace all spaces at end
last_match = last_match + u"., " + series_letter
rebuiltLine = rebuiltLine + u" <cds.TITLE>" + last_match + u"</cds.TITLE>"
else:
# IBID has no series letter. Replace as-is:
rebuiltLine = rebuiltLine + u" <cds.TITLE>" + last_match + u"</cds.TITLE>"
return (rebuiltLine,last_match)
def _addLineTitle(self,titlematch_str,titlematch_len,orig_line,rebuiltLine,start_pos,repidx,true_repidx,extras,last_match):
rebuiltLine=rebuiltLine+orig_line[start_pos:true_repidx]
series_letter = u""
#if self._ibidIdentifier.lineHasIbid(titlematch_str[repidx]):
if titlematch_str[repidx].upper().find(u"IBID") != -1:
# Replace IBID item
# Get series letter
series_letter = self._ibidIdentifier.getIbidSeriesLetter(titlematch_str[repidx])
if last_match != "":
# Replacement has already been made in this line. IBID can therefore be replaced
(rebuiltLine,last_match) = self._replaceIbid(series_letter, last_match, rebuiltLine, titlematch_str[repidx])
start_pos=true_repidx+titlematch_len[repidx]+extras
if orig_line[start_pos] == u"." or orig_line[start_pos] == u":" or\
orig_line[start_pos] == u";":
# Skip past ".:;" which may have followed an IBID:
start_pos=start_pos+1
else:
# No replacements made in this line before this IBID (its a line with an IBID and
# we dont know what the IBID refers to..ignore it
rebuiltLine = rebuiltLine + orig_line[true_repidx:true_repidx + titlematch_len[repidx] + extras]
start_pos=true_repidx+titlematch_len[repidx]+extras
else:
# Normal title replacement - not an IBID
# Skip past any "[" or "(" chars
rebuiltLine = rebuiltLine + u"<cds.TITLE>" + self._titleslist[titlematch_str[repidx]] + u"</cds.TITLE>"
last_match = self._titleslist[titlematch_str[repidx]]
start_pos = true_repidx+titlematch_len[repidx]+extras
if orig_line[start_pos] == u"." or orig_line[start_pos] == u":" or\
orig_line[start_pos] == u";":
# Skip past punctuation at end of title
start_pos = start_pos + 1
return (rebuiltLine,start_pos,last_match)
def _replaceLineItemPreprintRef(self,pprintmatch_str,pprintmatch_len,orig_line,rebuiltLine,start_pos,repidx,true_repidx,extras):
"""Replace a Preprint reference item in the line with a marked-up, standardised version of itself"""
# Often pprint refs are enclosed in "[]" chars which we dont want. Stop 1 char before this if possible:
if (true_repidx - start_pos - 1) >= 0:
rebuiltLine = rebuiltLine + orig_line[start_pos:true_repidx - 1]
else:
rebuiltLine = rebuiltLine + orig_line[start_pos:true_repidx]
# Is next char a "[" or "("? Skip past it if yes:
if orig_line[true_repidx] == u"[" or \
orig_line[true_repidx] == u"(":
rebuiltLine = rebuiltLine + u" - "
else:
rebuiltLine = rebuiltLine + orig_line[true_repidx-1]
rebuiltLine = rebuiltLine + u"<cds.RN>" + pprintmatch_str[repidx] + u"</cds.RN>"
start_pos = true_repidx + pprintmatch_len[repidx] + extras
try:
if orig_line[start_pos] == u"]" or orig_line[start_pos] == u")":
# Skip past preprint ref no closing brace
start_pos = start_pos + 1
except IndexError:
# Went past end of line. Ignore.
pass
return (rebuiltLine, start_pos)
def _addLineURI(self,urlmatch_str,urldesc_str,urlmatchfull_len,uri_virtual_locations,orig_line,rebuiltLine,start_pos,repidx,true_repidx):
rebuiltLine = rebuiltLine + orig_line[start_pos:start_pos + true_repidx - start_pos]
rebuiltLine = rebuiltLine + u"<cds.URI value=\"" + urlmatch_str[uri_virtual_locations[repidx]] + u"\">" + urldesc_str[uri_virtual_locations[repidx]] + u"</cds.URI>"
start_pos = true_repidx + urlmatchfull_len[uri_virtual_locations[repidx]]
return (rebuiltLine, start_pos)
def _addExtraStrippedSpaces(self, spacesKeys, removed_spaces, rep_types, pprintmatch_len, titlematch_len, true_repidx, spare_repidx, repidx, extras):
"""For a replacement index position, calculate a new (correct) replacement index, based on any spaces that have been removed before it, according to the type of the replacement"""
for strip_space in spacesKeys:
if strip_space < true_repidx:
# Spaces were removed before this replacement item should be placed. Add number of spaces removed to current replacement idx:
true_repidx = true_repidx + removed_spaces[strip_space]
spare_repidx = spare_repidx + removed_spaces[strip_space]
elif (strip_space >= spare_repidx) and (rep_types[repidx] == u"TITLE") and\
(strip_space < (spare_repidx + titlematch_len[repidx])):
# Replacing a periodical title. Account for double spaces that may have been removed
# from the title before it was recognised.
spare_repidx = spare_repidx + removed_spaces[strip_space]
extras = extras + removed_spaces[strip_space]
elif (strip_space >= spare_repidx) and (rep_types[repidx] == u"RN") and\
(strip_space < (spare_repidx + pprintmatch_len[repidx])):
# Replacing an institute preprint reference. Spaces would have been removed from this
# pprint reference itself, and must therefore be added
spare_repidx = spare_repidx + removed_spaces[strip_space]
extras = extras + removed_spaces[strip_space]
return (true_repidx, spare_repidx, extras)
def _getReplacementTypes(self,urls,titles,preprints):
"""Make dictionary detailing the type of replacement made at each position"""
rep_types = {}
for idx in urls:
rep_types[idx] = u"URI"
for idx in titles:
rep_types[idx] = u"TITLE"
for idx in preprints:
rep_types[idx] = u"RN"
return rep_types
def _getVirtualUrlPositions(self, url_keys, spaces_keys, removed_spaces):
"""URLs were removed before punctuation and multiple spaces were recorded and stripped. This method makes a dictionary of
URL positions as-if the URLs had been identified/removed after the punctuation/spaces
"""
uri_virtual_locations = {}
for idx in url_keys:
virtual_pos = idx
for spcidx in spaces_keys:
if spcidx < idx:
# Spaces were removed before this URL. Account for this.
virtual_pos = virtual_pos - removed_spaces[spcidx]
# All spaces removed before this URL accounted for - add it to the dictionary
uri_virtual_locations[virtual_pos] = idx
return uri_virtual_locations
class ReferenceSectionMarkupProcessor:
    """Process a reference section. Each line is cleaned, its cited items are identified and their
    notation standardised; a ProcessedReferenceSection holding the processed lines is returned
    """
    def __init__(self, institutes, titles):
        """Keep the two knowledge bases and create the helper objects used on every line

        institutes -- object offering identifyPreprintReferences(line)
        titles     -- object offering findPeriodicalTitles(line); also handed to the line builder
        """
        self._instlist = institutes
        self._titleslist = titles
        self._ibidIdentifier = LineIBIDidentifier()
        self._numerationIdentifier = NumerationHandler()
        self._lineCleaner = LineCleaner()
        self._lineBuilder = ProcessedReferenceLineBuilder(self._titleslist, self._ibidIdentifier, self._numerationIdentifier, self._lineCleaner)
        self._accentTransformer = EscapeSequenceTransformer()
        self._punctuationStripper = PunctuationStripper()
        self._multispaceRemover = MultispaceRemover()
        self._urlRemover = URLidentifier()
    def getProcessedReferenceSection(self, refSect):
        """Take a ReferenceSection as argument. For each line, process it"""
        processedRefSection = ProcessedReferenceSection()
        for line in refSect:
            # URL recognition (self._urlRemover.removeURLs) is switched off here,
            # so nothing is ever recorded for URLs:
            url_full_lens = {}
            url_strs = {}
            url_descrs = {}
            # Preliminary cleaning of the line (a unicode string): transform bad accents,
            # clean punctuation & double spaces, standardise numeration, clean again
            cleaned = line.getContent()
            cleaned = self._accentTransformer.processLine(cleaned)
            cleaned = self._lineCleaner.clean(cleaned)
            cleaned = self._numerationIdentifier.standardise(cleaned)
            cleaned = self._lineCleaner.clean(cleaned)
            # Items are searched for in an upper-cased copy without punctuation and
            # multiple spaces; the positions of the removed spaces are remembered
            search_line = cleaned.upper()
            search_line = self._punctuationStripper.strip(search_line)
            (removedSpaces, search_line) = self._multispaceRemover.recordRemove(search_line)
            (pp_lens, pp_rep_strs, search_line, found_pprint) = self._instlist.identifyPreprintReferences(search_line)
            (title_lens, title_txts, search_line, found_title) = self._titleslist.findPeriodicalTitles(search_line)
            citationMatch = False
            if found_pprint or found_title:
                citationMatch = True
            # If there is an IBID in the line, do a 2nd pass to try to catch it & identify its meaning
            if search_line.upper().find(u"IBID") != -1:
                # Record/remove IBID(s) in line and treat them as title matches
                (ibid_lens, ibid_txts, search_line) = self._ibidIdentifier.identify_record_ibids(search_line)
                for pos in ibid_lens.keys():
                    title_lens[pos] = ibid_lens[pos]
                for pos in ibid_txts.keys():
                    title_txts[pos] = ibid_txts[pos]
            # Create "ProcessedReferenceLine":
            processedRefSection.appendLine(
                self._lineBuilder.getProcessedReferenceLine(title_lens, title_txts, pp_rep_strs, pp_lens,
                                                            url_full_lens, url_strs, url_descrs, removedSpaces,
                                                            line.getContent(), cleaned, search_line, citationMatch))
        return processedRefSection
class LineItem:
    """Base class of everything that can be stored in a processed reference line"""
    def getSelfMARCXML(self):
        """Return self, as marc xml string (this base class has nothing to give and returns None)"""
        return None
class LineMarker(LineItem):
    """The marker of a reference line, written out as subfield "o" of a 999C5 datafield"""
    def __init__(self, val):
        """Keep val when it is a str or unicode object; anything else is stored as u"" """
        self._value = u""
        if type(val) in (str, unicode):
            self._value = val
    def getSelfMARCXML(self):
        """Return the datafield holding the escaped marker"""
        opening = u""" <datafield tag="999" ind1="C" ind2="5">
<subfield code="o">"""
        closing = u"""</subfield>
</datafield>\n"""
        return opening + cgi.escape(self._value) + closing
class LineMiscellaneousText(LineItem):
    """Unrecognised text of a reference line, written out as subfield "m" of a 999C5 datafield"""
    def __init__(self, val):
        """Keep val, stripped, when it is a str or unicode object; anything else is stored as u"" """
        self._value = u""
        if type(val) in (str, unicode):
            self._value = val.strip()
    def getSelfMARCXML(self):
        """Return the datafield holding the escaped text"""
        opening = u""" <datafield tag="999" ind1="C" ind2="5">
<subfield code="m">"""
        closing = u"""</subfield>
</datafield>\n"""
        return opening + cgi.escape(self._value) + closing
class Citation(LineItem):
    """Abstract - common parent of every kind of citation.

    Adds nothing to LineItem; an isinstance() check against it tells citations apart from
    other line items, e.g. to count the citations found in a line.
    """
class TitleCitation(Citation):
    """A citation of a periodical title with optional page, volume, year and miscellaneous text"""
    def __init__(self, title, misc = None, pg = None, vol = None, yr = None):
        """Store the parts of the citation; misc is stripped of surrounding whitespace when given"""
        self._title = title
        self._misc = misc
        if misc is not None:
            self._misc = misc.strip()
        self._page = pg
        self._volume = vol
        self._yr = yr
    def getSelfMARCXML(self):
        """Return the citation as a 999C5 datafield.

        Subfields are written in the order m, t, p, v, y. The title is always written; the other
        subfields only when their value is a str or unicode object.
        """
        def is_text(val):
            return val is not None and (type(val) is unicode or type(val) is str)
        pieces = [u""" <datafield tag="999" ind1="C" ind2="5">\n"""]
        if is_text(self._misc):
            pieces.append(u""" <subfield code="m">"""+cgi.escape(self._misc)+u"""</subfield>\n""")
        pieces.append(u""" <subfield code="t">"""+cgi.escape(self._title)+u"""</subfield>\n""")
        if is_text(self._page):
            pieces.append(u""" <subfield code="p">"""+cgi.escape(self._page)+u"""</subfield>\n""")
        if is_text(self._volume):
            pieces.append(u""" <subfield code="v">"""+cgi.escape(self._volume)+u"""</subfield>\n""")
        if is_text(self._yr):
            pieces.append(u""" <subfield code="y">"""+cgi.escape(self._yr)+u"""</subfield>\n""")
        pieces.append(u""" </datafield>\n""")
        return u"".join(pieces)
class TitleCitationStandard(Citation):
    """[journal name] [volume] ([year]) [pagination]

    A periodical citation written in the standard form above. It is only written out as an "s"
    subfield when title, volume, year and page are all present; otherwise whatever is known is
    written as miscellaneous text.
    """
    def __init__(self, title, misc = None, pg = None, vol = None, yr = None):
        """Store the parts of the citation; misc is stripped of surrounding whitespace when given"""
        self._title = title
        if misc is not None: self._misc = misc.strip()
        else: self._misc = misc
        self._page = pg
        self._volume = vol
        self._yr = yr
    def _isComplete(self):
        """True when title, page, volume and year are all known"""
        return self._title is not None and self._page is not None and self._volume is not None and self._yr is not None
    def _miscIsText(self):
        """True when the miscellaneous text is a str or unicode object"""
        return self._misc is not None and (type(self._misc) is unicode or type(self._misc) is str)
    def hasMisc(self):
        """True when there is meaningful miscellaneous text, or when the citation is incomplete
        (an incomplete citation is itself written out as miscellaneous text)
        """
        if self._miscIsText() and len(self._misc.strip("()[], {}-")) > 0:
            return True
        return not self._isComplete()
    def getS_subfield(self):
        """Return the "s" subfield of a complete citation, None for an incomplete one"""
        # NOTE(review): unlike the misc text, these values are not passed through cgi.escape
        if self._isComplete():
            return u""" <subfield code="s">%s %s (%s) %s</subfield>\n"""%(self._title,self._volume,self._yr,self._page)
        else:
            return None
    def getSelfMARCXML(self, xtra_subfield=None):
        """Return the citation as a 999C5 datafield.

        xtra_subfield -- ready-made subfield markup appended before the datafield is closed
        """
        subfieldOpen = False
        out = u""" <datafield tag="999" ind1="C" ind2="5">\n"""
        if self._miscIsText():
            # The "m" subfield is left open: an incomplete citation is appended to it below
            out += u""" <subfield code="m">"""+cgi.escape(self._misc)
            subfieldOpen = True
        if self._isComplete():
            if subfieldOpen:
                out += u"""</subfield>\n"""
            out += self.getS_subfield()
        else:
            if not subfieldOpen:
                out += u""" <subfield code="m">"""
            # Each part is written only when it is known itself. All four used to be gated on
            # the title, so a missing volume, year or page was written out as the text "None".
            if self._title is not None: out += u" %s"%(self._title,)
            if self._volume is not None: out += u" %s"%(self._volume,)
            if self._yr is not None: out += u" (%s)"%(self._yr,)
            if self._page is not None: out += u" %s"%(self._page,)
            out += u"""</subfield>\n"""
        if xtra_subfield is not None:
            out += xtra_subfield
        out += u""" </datafield>\n"""
        return out
class InstitutePreprintReferenceCitation(Citation):
    """A citation of an institutional preprint report number, with optional miscellaneous text"""
    def __init__(self, rn, misc = None):
        """Store the report number; misc is kept (stripped) only when something is left of it
        once brackets, commas, spaces and dashes are taken off its ends
        """
        self._rn = rn
        self._misc = None
        if misc is not None and len(misc.strip("()[], {}-")) > 0:
            self._misc = misc.strip()
    def _miscIsText(self):
        """True when the miscellaneous text is a str or unicode object"""
        return self._misc is not None and (type(self._misc) is unicode or type(self._misc) is str)
    def hasMisc(self):
        """True when non-blank miscellaneous text is attached to the citation"""
        return bool(self._miscIsText() and len(self._misc.strip()) > 0)
    def getRN_subfield(self):
        """Return the "r" subfield holding the escaped report number"""
        return u""" <subfield code="r">"""+cgi.escape(self._rn)+u"""</subfield>\n"""
    def getSelfMARCXML(self, xtra_subfield=None):
        """Return the citation as a 999C5 datafield.

        xtra_subfield -- ready-made subfield markup appended before the datafield is closed
        """
        pieces = [u""" <datafield tag="999" ind1="C" ind2="5">\n"""]
        if self._miscIsText():
            pieces.append(u""" <subfield code="m">"""+cgi.escape(self._misc)+u"""</subfield>\n""")
        pieces.append(u""" <subfield code="r">"""+cgi.escape(self._rn)+u"""</subfield>\n""")
        out = u"".join(pieces)
        if xtra_subfield is not None:
            out += xtra_subfield
        out += u""" </datafield>\n"""
        return out
class URLCitation(Citation):
    """A citation of a URL together with its description and optional miscellaneous text"""
    def __init__(self, url, urldescr, misc=None):
        """Store URL and description; misc is stripped of surrounding whitespace when given"""
        self._url = url
        self._urldescr = urldescr
        self._misc = misc
        if misc is not None:
            self._misc = misc.strip()
    def getSelfMARCXML(self):
        """Return the citation as a 999C5 datafield with subfields m (if any), u and z"""
        pieces = [u""" <datafield tag="999" ind1="C" ind2="5">\n"""]
        if self._misc is not None and (type(self._misc) is unicode or type(self._misc) is str):
            pieces.append(u""" <subfield code="m">"""+cgi.escape(self._misc)+u"""</subfield>\n""")
        pieces.append(u""" <subfield code="u">"""+cgi.escape(self._url)+u"""</subfield>\n""")
        pieces.append(u""" <subfield code="z">"""+cgi.escape(self._urldescr)+u"""</subfield>\n""")
        pieces.append(u""" </datafield>\n""")
        return u"".join(pieces)
class ProcessedReferenceLine:
    """This is a reference line that has been processed for cited items"""
    def __init__(self):
        # Segments of the reference line (each a 'LineItem'), keyed 0, 1, 2, ... in the order added
        self._segments = {}
        self._nextposn = 0
    def getSelfMARCXML(self):
        """Return an XML string containing this lines contents, marked up in XML MARC, as used in CDS

        A standard title citation and a report number standing next to each other are written into
        a single datafield, provided the second of the two carries no miscellaneous text.
        """
        out = u""
        total = len(self._segments)
        pos = 0
        while pos < total:
            current = self._segments[pos]
            follower = None
            if pos < total - 1:
                follower = self._segments[pos + 1]
            if isinstance(current, TitleCitationStandard) and isinstance(follower, InstitutePreprintReferenceCitation) and not follower.hasMisc():
                # $s (periodical title) followed immediately by its report number ($r):
                # both go under the $s, and the report number segment is consumed
                out += current.getSelfMARCXML(follower.getRN_subfield())
                pos += 2
            elif isinstance(current, InstitutePreprintReferenceCitation) and isinstance(follower, TitleCitationStandard) and not follower.hasMisc():
                # Report number ($r) followed immediately by its periodical title ($s):
                # both go into one datafield, and the title segment is consumed
                out += current.getSelfMARCXML(follower.getS_subfield())
                pos += 2
            else:
                out += current.getSelfMARCXML()
                pos += 1
        return out
    def addSection(self, newSect):
        """Append newSect to the line; anything that is not a LineItem is silently ignored"""
        if not isinstance(newSect,LineItem):
            return
        self._segments[self._nextposn] = newSect
        self._nextposn += 1
    def getNumberCitations(self):
        """Return the number of segments of this line that are citations"""
        return len([segment for segment in self._segments.values() if isinstance(segment, Citation)])
class ProcessedReferenceSection:
    """This is a reference section after it has been processed to identify cited items. It holds ProcessedReferenceLines."""
    def __init__(self):
        # Processed lines, keyed 0, 1, 2, ... in the order they were appended
        self._lines = {}
        self._nextline = 0
    def getSelfMARCXML(self):
        """Return a unicode string of all reference lines marked up in MARC XML"""
        return u"".join([self._lines[pos].getSelfMARCXML() for pos in range(len(self._lines))])
    def appendLine(self, ln):
        """Add a new line to the processed reference lines; anything that is not a ProcessedReferenceLine is ignored"""
        if not isinstance(ln, ProcessedReferenceLine):
            return
        self._lines[self._nextline] = ln
        self._nextline += 1
    def getTotalNumberCitations(self):
        """Return an integer representing the total number of citations recognised (and thus marked up) in the reference section"""
        total = 0
        for pos in range(len(self._lines)):
            total += self._lines[pos].getNumberCitations()
        return total
class NumerationHandler:
    """Class whose instances identify reference numeration patterns in a text line and rearrange them into standardised numeration patterns
    Returns line with numeration patterns marked up in an XML style
    """
    def __init__(self):
        # Both lists hold [compiled pattern, replacement] pairs, applied in list order
        self._ptnList = []
        self._checkAgainPtnList = []
        self._ptn_seriesRemove = re.compile(unicode(r'((\<cds.TITLE\>)([^\<]+)(\<\/cds.TITLE\>)\s*.?\s*\<cds\.SER\>([A-H]|(I{1,3}V?|VI{0,3}))\<\/cds\.SER\>)'),re.UNICODE)
        self._setSearchPatterns()
        self._setRecheckPatterns()
    def _setRecheckPatterns(self):
        """After the line has been rebuilt with marked up titles, it can be rechecked for numeration patterns because perhaps now more can be found with the aid of the recognised titles"""
        recheck_rules = (
            (unicode(r'\(?([12]\d{3})([A-Za-z]?)\)?,? *(<cds\.TITLE>(\.|[^<])*<\/cds\.TITLE>),? *(\b[Vv]o?l?\.?)?\s?(\d+)(,\s*|\s+)[pP]?[p]?\.?\s?([RL]?\d+[c]?)\-?[RL]?\d{0,6}[c]?'),
             unicode('\\g<1>\\g<2>, \\g<3> \\g<6> (\\g<1>) \\g<8>')),
            (unicode(r'\(?([12]\d{3})([A-Za-z]?)\)?,? *(<cds\.TITLE>(\.|[^<])*<\/cds\.TITLE>),? *(\b[Vv]o?l?\.?)?\s?(\d+)\s?([A-H])\s?[pP]?[p]?\.?\s?([RL]?\d+[c]?)\-?[RL]?\d{0,6}[c]?'),
             unicode('\\g<1>\\g<2>, \\g<3> \\g<6> \\g<7> \\g<8> (\\g<1>)')),
        )
        for seek, replacement in recheck_rules:
            self._checkAgainPtnList.append([re.compile(seek,re.UNICODE), replacement])
    def _setSearchPatterns(self):
        """Populate self._ptnList with seek/replace numeration pattern pairs"""
        search_rules = (
            # Delete the colon and expressions as Serie, vol, V. inside the pattern <serie : volume>
            (unicode(r'(Serie\s|\bS\.?\s)?([A-H])\s?[:,]\s?(\b[Vv]o?l?\.?)?\s?(\d+)'),
             unicode('\\g<2> \\g<4>')),
            # The 4 patterns below standardise numeration as <serie(?) : volume (year) page>
            # Pattern 1: <x, vol, year, page>
            (unicode(r'(\b[Vv]o?l?\.?)?\s?(\d+)\s?\(([1-2]\d\d\d)\),?\s?[pP]?[p]?\.?\s?([RL]?\d+[c]?)(?:\-|\255)?[RL]?\d{0,6}[c]?'),
             unicode(' : <cds.VOL>\\g<2></cds.VOL> <cds.YR>(\\g<3>)</cds.YR> <cds.PG>\\g<4></cds.PG> ')),
            # Pattern 2: <vol, serie, year, page>
            (unicode(r'(\b[Vv]o?l?\.?)?\s?(\d+)\s?([A-H])\s?\(([1-2]\d\d\d)\),?\s?[pP]?[p]?\.?\s?([RL]?\d+[c]?)(?:\-|\255)?[RL]?\d{0,6}[c]?'),
             unicode(' <cds.SER>\\g<3></cds.SER> : <cds.VOL>\\g<2></cds.VOL> <cds.YR>(\\g<4>)</cds.YR> <cds.PG>\\g<5></cds.PG> ')),
            # Pattern 3: <x, vol, page, year>
            (unicode(r'(\b[Vv]o?l?\.?)?\s?(\d+)\s?[,:]\s?[pP]?[p]?\.?\s?([RL]?\d+[c]?)(?:\-|\255)?[RL]?\d{0,6}[c]?,?\s?\(?([1-2]\d\d\d)\)?'),
             unicode(' : <cds.VOL>\\g<2></cds.VOL> <cds.YR>(\\g<4>)</cds.YR> <cds.PG>\\g<3></cds.PG> ')),
            # Pattern 4: <vol, serie, page, year>
            (unicode(r'(\b[Vv]o?l?\.?)?\s?(\d+)\s?([A-H])[,:\s]\s?[pP]?[p]?\.?\s?([RL]?\d+[c]?)(?:\-|\255)?[RL]?\d{0,6}[c]?,?\s?\(([1-2]\d\d\d)\)'),
             unicode(' <cds.SER>\\g<3></cds.SER> : <cds.VOL>\\g<2></cds.VOL> <cds.YR>(\\g<5>)</cds.YR> <cds.PG>\\g<4></cds.PG> ')),
        )
        for seek, replacement in search_rules:
            self._ptnList.append([re.compile(seek,re.UNICODE), replacement])
    def removeSeriesTags(self, ln):
        """Remove any "<cds.SER/>" tag that follows a "<cds.TITLE/>" element from a line. Series information should be part of a title, not separate"""
        while True:
            m_tagged = self._ptn_seriesRemove.search(ln)
            if m_tagged is None:
                break
            title_text = m_tagged.group(3)
            series_letter = m_tagged.group(5)
            # NOTE(review): the original comment read "if there is no comma in the matched title,
            # add one before the series info; if a comma is already present, discard the series
            # info". The test below does the opposite (series appended only when a comma IS
            # present) - behaviour kept as found, confirm which one is intended.
            if title_text.find(u",") != -1:
                title_text = title_text.rstrip()
                if title_text[-1] == u".":
                    title_text = title_text + u", " + series_letter
                else:
                    title_text = title_text + u"., " + series_letter
            # Put the title element, without the series tag, in place of the whole match
            retagged = u"%s%s%s"%(m_tagged.group(2),title_text,m_tagged.group(4))
            ln = re.sub(u"%s"%(re.escape(m_tagged.group(0)),),retagged,ln,1)
        return ln
    def restandardise(self, ln):
        """Given that some more titles have been recognised within a line, reprocess that line in the hopes of recognising more numeration patterns"""
        for pattern, replacement in self._checkAgainPtnList:
            ln = pattern.sub(replacement, ln)
        return self.standardise(ln)
    def standardise(self, ln):
        """Accept ln (text line) as argument. Replace non-standard numeration styles in it with marked-up versions in a standard format.
        These recognised and marked-up numeration patterns can later be used to identify cited documents
        """
        for pattern, replacement in self._ptnList:
            ln = pattern.sub(replacement, ln)
        return ln
class LineCleaner:
    """Class to enable lines to be cleaned of punctuation errors"""
    def __init__(self):
        # (compiled pattern, replacement) pairs. A list is used so that the corrections are always
        # applied in the order they are declared: several of them overlap (e.g. " ," and ", :"),
        # so applying them in the arbitrary order of dictionary keys could change the result.
        self._correctionList = []
        # Compiled once here instead of on every call of clean()
        self._dblSpacePtn = re.compile(unicode(r'\s{2,}'),re.UNICODE)
        self._setCorrectionList()
    def _setCorrectionList(self):
        """Set the list of punctuation (etc) errors in a line to be corrected"""
        corrections = (
            (unicode(r'\s,'), u','),
            (unicode(r'\s;'), u';'),
            (unicode(r'\s\.'), u'.'),
            (unicode(r':\s:'), u':'),
            (unicode(r',\s:'), u':'),
            (unicode(r'\s\]'), u']'),
            (unicode(r'\[\s'), u'['),
            # Hyphen symbols. NOTE(review): this first pattern matches a literal backslash
            # followed by "255" - confirm that is what was meant
            (unicode(r'\\255'), u'-'),
            (u'\u02D7', u'-'),
            (u'\u0335', u'-'),
            (u'\u0336', u'-'),
            (u'\u2212', u'-'),
            (u'\u002D', u'-'),
            (u'\uFE63', u'-'),
            (u'\uFF0D', u'-'),
        )
        for seek, replacement in corrections:
            self._correctionList.append((re.compile(seek,re.UNICODE), replacement))
    def clean(self, ln):
        """Return ln with runs of whitespace reduced to one space and the punctuation corrections applied"""
        # Remove double spaces:
        ln = self._dblSpacePtn.sub(u' ', ln)
        # Correct other bad punctuation:
        for pattern, replacement in self._correctionList:
            ln = pattern.sub(replacement, ln)
        return ln
class PunctuationStripper:
    """Class to strip punctuation characters from a line & replace them with a space character"""
    def __init__(self):
        # Characters treated as punctuation: . , ; ' ( ) -
        self._punct = re.compile(unicode(r'[\.\,\;\'\(\)\-]'),re.UNICODE)
        self._rep = u' '
    def strip(self, ln):
        """Return ln with each punctuation character replaced by one space (the length of the line is kept)"""
        stripped = self._punct.sub(self._rep, ln)
        return stripped
class MultispaceRemover:
    """Class to remove all occurrences of multiple spaces from a line and replace them with a single space while recording information about their positioning"""
    def __init__(self):
        self._spcPtn = re.compile(unicode(r'(\s{2,})'),re.UNICODE)
    def recordRemove(self, ln):
        """Return a tuple of 2 items: a dictionary with information about the removed multispaces,
        and the line itself after the multispaces have been converted to single spaces.

        The dictionary is keyed by the position (in the line as given) at which a run of
        whitespace starts; the value is the number of characters the run is shortened by.
        """
        removedSpaces = {}
        for run in self._spcPtn.finditer(ln):
            # One character of the run survives as the single space
            removedSpaces[run.start()] = len(run.group(0)) - 1
        return (removedSpaces, self._spcPtn.sub(u' ', ln))
def getFileList(fname):
    """Return a list of files to be processed

    fname names a text file holding one entry per line. Every line is returned stripped of
    surrounding whitespace, in file order. None is returned when fname is not readable or when
    an I/O error occurs while opening or reading it.
    """
    if not os.access(fname, os.R_OK):
        return None
    flist = []
    try:
        f = open(fname, "r")
        try:
            for line in f:
                flist.append(line.strip())
        finally:
            # Close the file even when reading fails part-way through
            f.close()
    except IOError:
        return None
    return flist
def getRecidFilenames(args):
    """Split "recid:filepath" arguments into (recid, filepath) tuples.

    An argument that does not consist of exactly two colon-separated parts is skipped and a
    warning is written to stderr. Returns the list of tuples, in argument order.
    """
    files = []
    for arg in args:
        parts = arg.split(":")
        if len(parts) == 2:
            files.append((parts[0],parts[1]))
        else:
            sys.stderr.write(u"W: Recid:filepath argument invalid. Skipping.\n")
    return files
def main():
    """Command-line entry point: extract the references of every "recid:pdffile" argument and
    write them to stdout as a MARC XML collection.

    -h/--help and -V/--version write the corresponding message to stderr and stop, as does a call
    without any usable argument. Files that do not exist, cannot be converted to text or turn out
    to be empty are reported on stderr and skipped.
    """
    try:
        myoptions, myargs = getopt.getopt(sys.argv[1:], "hV", ["help","version"])
    except getopt.GetoptError:
        # Unknown option: show the usage message rather than a traceback
        sys.stderr.write("%s\n" % (SystemMessage().getHelpMessage(),))
        sys.exit(1)
    for o in myoptions:
        if o[0] in ("-V","--version"):
            sys.stderr.write("%s\n" % (SystemMessage().getVersionMessage(),)) # Version message and stop
            sys.exit(0)
        elif o[0] in ("-h","--help"):
            sys.stderr.write("%s\n" % (SystemMessage().getHelpMessage(),)) # Help message and stop
            sys.exit(0)
    if len(myargs) == 0:
        sys.stderr.write("%s\n" % (SystemMessage().getHelpMessage(),)) # Help message and stop
        sys.exit(0)
    recidfiles = getRecidFilenames(myargs)
    if len(recidfiles) == 0:
        sys.stderr.write("%s\n" % (SystemMessage().getHelpMessage(),)) # Help message and stop
        sys.exit(0)
    converterList=[PDFtoTextDocumentConverter()] # List of document converters to use
    titles_kb = KnowledgeBase(fn = cfg_refextract_kb_journal_titles)
    institutes = InstituteList(fn = cfg_refextract_kb_report_numbers)
    refSect_processor = ReferenceSectionMarkupProcessor(institutes, titles_kb)
    openxmltag = u"""<?xml version="1.0" encoding="UTF-8"?>"""
    opencollectiontag = u"""<collection xmlns="http://www.loc.gov/MARC21/slim">"""
    closecollectiontag = u"""</collection>\n"""
    done_coltags = False
    for curitem in recidfiles:
        # curitem is a (recid, filepath) tuple; messages quote the path only
        (recid, fpath) = curitem
        if not os.access(fpath, os.F_OK):
            # path to file invalid
            sys.stderr.write("E: File Path %s invalid! Ignored.\n" % (fpath,))
            continue
        if len(converterList) < 1:
            # The message has no placeholder, so it must not be %-formatted (that raised TypeError)
            sys.stderr.write("E: No document converter tools available - cannot process reference extraction.\n")
            sys.exit(1)
        # Convert file to text, stopping at the first converter giving a non-empty document:
        doc = None
        for conv in converterList:
            doc = conv.convertDocument(fpath)
            try:
                if not doc.isEmpty():
                    break
            except AttributeError:
                pass
        if doc is None:
            sys.stderr.write("""W: File "%s" cannot be converted to plain-text. Cannot be processed.\n""" % (fpath,))
            continue
        # Do "Extract References" Stage
        try:
            doc_is_empty = doc.isEmpty()
        except AttributeError:
            # Not a usable document object: treated like an empty document
            doc_is_empty = True
        if doc_is_empty:
            sys.stderr.write("""W: File "%s" appears to be empty or cannot be read-in. Cannot be processed.\n""" % (fpath,))
            continue
        refSection = doc.extractReferences()
        if not done_coltags and not refSection.isEmpty():
            # Output collection tags once, ahead of the first non-empty reference section:
            sys.stdout.write("%s\n" % (openxmltag.encode("utf-8"),))
            sys.stdout.write("%s\n" % (opencollectiontag.encode("utf-8"),))
            done_coltags = True
        # Do citation title standardisation stage
        processedReferenceSection = refSect_processor.getProcessedReferenceSection(refSection)
        ReferenceSectionDisplayer().display(processedReferenceSection, recid)
    if done_coltags:
        sys.stdout.write("%s\n" % (closecollectiontag.encode("utf-8"),))

Event Timeline