"""Exception thrown if a line in the periodicals knowledge base does not comply with the expected format"""
pass
classKnowledgeBase:
"""The knowledge base of periodical titles. Consists of search & replace terms. The search terms consist of non-standard periodical titles in upper case.
These are often found in the text of documents. Replacement terms consist of standardised periodical titles in a standardised case. These will be used to
replace identified non-standard titles
"""
def __init__(self, fn=None):
    """Create an empty knowledge base, optionally loading it from a file.

    Three dictionaries are initialised empty. When `fn` is a string it is
    treated as the knowledge base filename and handed to `self._buildKB`;
    any other value (including the default `None`) leaves the knowledge
    base empty.
    """
    self._kb = {}
    self._compiledPatternsKB = {}
    self._unstandardisedTitle = {}
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses; every value accepted before is still accepted.
    if isinstance(fn, str):
        self._buildKB(fn)
def_buildKB(self,fn):
"""From the filename provided (fn), read the periodicals knowledge base into memory, and build a dictionary of seek/replace values to be stored in self._kb"""
"""Identify periodical titles in text line 'ln' and record information about where in the line they occur. Replace them for lower-case versions or
lowercase letter 'a's if the match was numerical. Return a Tuple containing dictionaries containing information about the substitutions, along with the new line
"""Make a copy of the list of numeration patterns for an Institute object. Return this new list"""
nl=[]
ccp=re.compile(unicode(r'\[[^\]]+\]'),re.UNICODE)
forxinself._numerationList:
# Remove the character class & append to newList
nx=ccp.sub(u'1',x)
nl.append((len(nx),x))
returnnl
def_createPattern(self,ptn):
"""Accept a user-defined search pattern, transform it, according to some simple rules, into a regex pattern, then compile and return it as a compiled RE object
"""For a replacement index position, calculate a new (correct) replacement index, based on any spaces that have been removed before it, according to the type of the replacement"""
forstrip_spaceinspacesKeys:
ifstrip_space<true_repidx:
# Spaces were removed before this replacement item should be placed. Add number of spaces removed to current replacement idx:
"""URLs were removed before punctuation and multiple spaces were recorded and stripped. This method makes a dictionary of
URL positions as-if the URLs had been identified/removed after the punctuation/spaces
"""
uri_virtual_locations={}
foridxinurl_keys:
virtual_pos=idx
forspcidxinspaces_keys:
ifspcidx<idx:
# Spaces were removed before this URL. Account for this.
virtual_pos=virtual_pos-removed_spaces[spcidx]
# All spaces removed before this URL accounted for - add it to the dictionary
uri_virtual_locations[virtual_pos]=idx
returnuri_virtual_locations
classReferenceSectionMarkupProcessor:
"""Process a reference section. Line will be cleaned, and cited items will be identified and their notation standardised. ProcessedReferenceLine will be returned"""
"""After the line has been rebuilt with marked up titles, it can be rechecked for numeration patterns because perhaps now more can be found with the aid of the recognised titles"""
"""Remove any "<cds.SER/>" tags from a line. Series information should be part of a title, not separate"""
m_seriesTagLine=self._ptn_seriesRemove.search(ln)
whilem_seriesTagLineisnotNone:
whole_match=m_seriesTagLine.group(0)
title_tag_opener=m_seriesTagLine.group(2)
title_text=m_seriesTagLine.group(3)
title_tag_closer=m_seriesTagLine.group(4)
series_letter=m_seriesTagLine.group(5)
real_title_text=title_text
# If there is no comma in the matched title, add one to the end of it before series info added. If there is already a comma present, simply discard the series info
"""Given that some more titles have been recognised within a line, reprocess that line in the hopes of recognising more numeration patterns"""
forxinself._checkAgainPtnList:
ln=x[0].sub(x[1],ln)
returnself.standardise(ln)
def standardise(self, ln):
    """Replace non-standard numeration styles in a line with marked-up versions.

    Each entry of `self._ptnList` is applied to the line in list order:
    element 0 of the entry must provide a `sub(replacement, text)` method
    (e.g. a compiled regular expression) and element 1 is the replacement
    passed to it. The output of one substitution is the input of the next,
    so the order of `self._ptnList` matters.

    ln -- the text line to transform.

    Returns the line after all substitutions have been applied. The
    recognised and marked-up numeration patterns can later be used to
    identify cited documents.
    """
    for entry in self._ptnList:
        ln = entry[0].sub(entry[1], ln)
    return ln
class LineCleaner:
    """Class to enable lines to be cleaned of punctuation errors"""

    def __init__(self):
        """Start with an empty correction table, then ask `_setCorrectionList` to fill it."""
        self._correctionList = {}
        self._setCorrectionList()

    def _setCorrectionList(self):
        """Set the list of punctuation (etc) errors in a line to be corrected"""
        # NOTE(review): no statements follow the docstring in the visible
        # source, so this method currently leaves self._correctionList
        # untouched. Confirm whether the correction entries were lost.
"""Class to remove all ocurrences of multiple spaces from a line and replace them with a single space while recording information about their positioning"""