if (mk_match2 is not None) and (string.atoi(mk_match2.group('num')) == 2) and (mk_match.group('left') == mk_match2.group('left')) and (mk_match.group('right') == mk_match2.group('right')):
# Found next line:
tmpFnd = 1
elif y == len(self._content) - 1:
tmpFnd = 1
y = y + 1
if tmpFnd:
foundSect = 1
refStartLine = x
refLineMarker = mk_match.group('mark')
refLineMarkerPattern = mk_ptn
x = x - 1
if refStartLine is not None:
# Make ReferenceSectionStartPoint object with ref section start location details
refStart = ReferenceSectionStartPoint()
refStart.setLineNum(refStartLine)
refStart.setMarkerChar(refLineMarker)
refStart.setMarkerPattern(refLineMarkerPattern)
return refStart
# NOTE(review): this method is extraction damage -- its real body is missing
# (only 'pass' remains; the orphaned loop code above this 'def' in the file
# looks like it belonged to it), and a second docstring ("Exception thrown if
# a line in the periodicals knowledge base...") that appears to belong to a
# missing Exception subclass has been fused onto it. The two unterminated
# triple-quoted strings below make this span syntactically invalid as-is.
# Restore this method from version control rather than editing it in place.
def findReferenceSection(self):
    """Find the line number of the start of a TextDocument object's reference section. Returns a 'ReferenceSectionStartPoint'
    object containing details of the reference section start line number, the reference section title & the marker char
    used for each reference line or returns None if not found
    """Exception thrown if a line in the periodicals knowledge base does not comply with the expected format"""
    pass
class KnowledgeBase:
    """The knowledge base of periodical titles. Consists of search & replace terms. The search terms consist of non-standard periodical titles in upper case.
    These are often found in the text of documents. Replacement terms consist of standardised periodical titles in a standardised case. These will be used to
    replace identified non-standard titles
    """
    def __init__(self, fn = None):
        # fn: optional path to the knowledge-base file. Anything that is not
        # exactly a str is ignored and an empty knowledge base is created.
        self._kb = {}                    # search term -> replacement term
        self._compiledPatternsKB = {}    # presumably search term -> compiled regex; populated elsewhere -- TODO confirm
        self._unstandardisedTitle = {}   # purpose not visible in this chunk -- TODO confirm against full file
        if type(fn) is str: self._buildKB(fn)
    def _buildKB(self, fn):
        """From the filename provided (fn), read the periodicals knowledge base into memory, and build a dictionary of seek/replace values to be stored in self._kb"""
        # NOTE(review): everything from here to the end of this class is
        # extraction damage. The docstrings of several unrelated methods have
        # been fused together (titles identification, pattern compilation,
        # replacement-index adjustment), and the loop below is the tail of a
        # different function entirely -- it references names ('spacesKeys',
        # 'true_repidx', 'uri_virtual_locations', 'virtual_pos', 'idx') that
        # are undefined here. Restore this class from version control rather
        # than editing it in place.
        """Identify periodical titles in text line 'ln' and record information about where in the line they occur. Replace them for lower-case versions or
        lowercase letter 'a's if the match was numerical. Return a Tuple containing dictionaries containing information about the substitutions, along with the new line
        """Accept a user-defined search pattern, transform it, according to some simple rules, into a regex pattern, then compile and return it as a compiled RE object
        """For a replacement index position, calculate a new (correct) replacement index, based on any spaces that have been removed before it, according to the type of the replacement"""
        for strip_space in spacesKeys:
            if strip_space < true_repidx:
                # Spaces were removed before this replacement item should be placed. Add number of spaces removed to current replacement idx:
                # All spaces removed before this URL accounted for - add it to the dictionary
                uri_virtual_locations[virtual_pos] = idx
        return uri_virtual_locations
class ReferenceSectionMarkupProcessor:
    """Process a reference section. Line will be cleaned, and cited items will be identified and their notation standardised. ProcessedReferenceLine will be returned"""
    # NOTE(review): the statements below are extraction damage -- they are the
    # tail of a 'getSelfMARCXML'-style method from one of the Citation classes
    # (they reference 'self._misc', 'self._rn' and 'xtra_subfield', none of
    # which exist at class scope, and 'return' is illegal outside a function),
    # while the real methods of this class are missing. Restore this class
    # from version control rather than editing it in place.
    out = u""" <datafield tag="999" ind1="C" ind2="5">\n"""
    if self._misc is not None and (type(self._misc) is unicode or type(self._misc) is str):
        out += u""" <subfield code="m">"""+cgi.escape(self._misc)+u"""</subfield>\n"""
    out += u""" <subfield code="r">"""+cgi.escape(self._rn)+u"""</subfield>\n"""
    if xtra_subfield is not None:
        out += xtra_subfield
    out += u""" </datafield>\n"""
    return out
class URLCitation(Citation):
def __init__(self, url, urldescr, misc=None):
self._url = url
self._urldescr = urldescr
if misc is not None: self._misc = misc.strip()
else: self._misc = misc
def getSelfMARCXML(self):
out = u""" <datafield tag="999" ind1="C" ind2="5">\n"""
if self._misc is not None and (type(self._misc) is unicode or type(self._misc) is str):
out += u""" <subfield code="m">"""+cgi.escape(self._misc)+u"""</subfield>\n"""
out += u""" <subfield code="u">"""+cgi.escape(self._url)+u"""</subfield>\n"""
out += u""" <subfield code="z">"""+cgi.escape(self._urldescr)+u"""</subfield>\n"""
out += u""" </datafield>\n"""
return out
class ProcessedReferenceLine:
    """A reference line that has been processed for cited items."""
    def __init__(self):
        # Segments of the reference line, each a 'LineItem', keyed by their
        # ordinal position (0, 1, 2, ...).
        self._segments = {}
        self._nextposn = 0
    def getSelfMARCXML(self):
        """Return an XML string containing this line's contents, marked up in
        XML MARC, as used in CDS. Adjacent periodical-title ($s) and
        report-number ($r) citations are merged into a single datafield.
        """
        out = u""
        total = len(self._segments)
        pos = 0
        while pos < total:
            seg = self._segments[pos]
            nxt = self._segments[pos + 1] if pos + 1 < total else None
            if (isinstance(seg, TitleCitationStandard) and nxt is not None
                    and isinstance(nxt, InstitutePreprintReferenceCitation)
                    and not nxt.hasMisc()):
                # A $s (periodical title) reference followed immediately by
                # its report number ($r): concatenate both under the $s and
                # consume both segments.
                out += seg.getSelfMARCXML(nxt.getRN_subfield())
                pos += 2
            elif (isinstance(seg, InstitutePreprintReferenceCitation) and nxt is not None
                    and isinstance(nxt, TitleCitationStandard)
                    and not nxt.hasMisc()):
                # A report number ($r) followed immediately by its periodical
                # title ($s): concatenate both under the $s.
                out += seg.getSelfMARCXML(nxt.getS_subfield())
                pos += 2
            else:
                out += seg.getSelfMARCXML()
                pos += 1
        return out
    def addSection(self, newSect):
        """Append newSect to this line's segments. Objects that are not
        LineItem instances are silently ignored.
        """
        if not isinstance(newSect, LineItem):
            return
        self._segments[self._nextposn] = newSect
        self._nextposn += 1
    def getNumberCitations(self):
        """Return the number of this line's segments that are Citations."""
        return sum(1 for posn in range(len(self._segments))
                   if isinstance(self._segments[posn], Citation))
class ProcessedReferenceSection:
    """A reference section after it has been processed to identify cited
    items. Holds an ordered collection of ProcessedReferenceLine objects.
    """
    def __init__(self):
        # Processed lines keyed by their ordinal position (0, 1, 2, ...).
        self._lines = {}
        self._nextline = 0
    def getSelfMARCXML(self):
        """Return a unicode string of all reference lines marked up in MARC XML."""
        return u"".join(self._lines[idx].getSelfMARCXML()
                        for idx in range(len(self._lines)))
    def appendLine(self, ln):
        """Add a new line to the list of processed reference lines. Objects
        that are not ProcessedReferenceLine instances are silently ignored.
        """
        if not isinstance(ln, ProcessedReferenceLine):
            return
        self._lines[self._nextline] = ln
        self._nextline += 1
    def getTotalNumberCitations(self):
        """Return an integer representing the total number of citations
        recognised (and thus marked up) in the reference section.
        """
        total = 0
        for idx in range(len(self._lines)):
            total += self._lines[idx].getNumberCitations()
        return total
# NOTE(review): the body of this class is partly extraction damage. The class
# docstring has been fused with the docstrings of two missing methods, a
# stray comment about comma handling has been spliced in, and the first loop
# below is the orphaned body of a "re-check" method whose 'def' line (and the
# __init__ that builds self._checkAgainPtnList / self._ptnList) was lost.
# Only 'standardise' is intact. Restore this class from version control
# rather than editing it in place.
class NumerationHandler:
    """Class whose instances identify reference numeration patterns in a text line and rearrange them into standardised numeration patterns
    Returns line with numeration patterns marked up in an XML style
    """After the line has been rebuilt with marked up titles, it can be rechecked for numeration patterns because perhaps now more can be found with the aid of the recognised titles"""
    # If there is no comma in the matched title, add one to the end of it before series info added. If there is already a comma present, simply discard the series info
    """Given that some more titles have been recognised within a line, reprocess that line in the hopes of recognising more numeration patterns"""
    for x in self._checkAgainPtnList:
        ln = x[0].sub(x[1], ln)
    return self.standardise(ln)
    def standardise(self, ln):
        """Accept ln (text line) as argument. Perform transformations on this line to replace non-standard numeration styles with marked-up versions in a standard format.
        These recognised and marked-up numeration patterns can later be used to identify cited documents
        """
        # Each entry of self._ptnList is a (compiled regex, replacement) pair;
        # apply them all in order to the line.
        for x in self._ptnList:
            ln = x[0].sub(x[1], ln)
        return ln
class LineCleaner:
    """Class to enable lines to be cleaned of punctuation errors"""
    def __init__(self):
        # Mapping of correction rules; populated by _setCorrectionList().
        self._correctionList = {}
        self._setCorrectionList()
    def _setCorrectionList(self):
        """Set the list of punctuation (etc) errors in a line to be corrected"""
        # NOTE(review): the real body of this method is missing (extraction
        # damage), and the string below ("Class to remove all ocurrences of
        # multiple spaces...") looks like the docstring of a separate class
        # that was cut off at this chunk boundary. Restore from version
        # control rather than editing here.
        """Class to remove all ocurrences of multiple spaces from a line and replace them with a single space while recording information about their positioning"""