diff --git a/modules/bibedit/lib/refextract.py b/modules/bibedit/lib/refextract.py index 916937b02..8d305aa8d 100644 --- a/modules/bibedit/lib/refextract.py +++ b/modules/bibedit/lib/refextract.py @@ -1,4075 +1,4098 @@ # -*- coding: utf-8 -*- ## ## $Id$ ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """This is the main body of refextract. It is used to extract references from fulltext PDF documents. """ __revision__ = "$Id$" try: import sys, sre import os, getopt from time import mktime, localtime from invenio.refextract_config \ import CFG_REFEXTRACT_VERSION, \ CFG_REFEXTRACT_KB_JOURNAL_TITLES, \ CFG_REFEXTRACT_KB_REPORT_NUMBERS, \ CFG_REFEXTRACT_CTRL_FIELD_RECID, \ CFG_REFEXTRACT_TAG_ID_REFERENCE, \ CFG_REFEXTRACT_IND1_REFERENCE, \ CFG_REFEXTRACT_IND2_REFERENCE, \ CFG_REFEXTRACT_SUBFIELD_MARKER, \ CFG_REFEXTRACT_SUBFIELD_MISC, \ CFG_REFEXTRACT_SUBFIELD_REPORT_NUM, \ CFG_REFEXTRACT_SUBFIELD_TITLE, \ CFG_REFEXTRACT_SUBFIELD_URL, \ CFG_REFEXTRACT_SUBFIELD_URL_DESCR, \ CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS, \ CFG_REFEXTRACT_IND1_EXTRACTION_STATS, \ CFG_REFEXTRACT_IND2_EXTRACTION_STATS, \ CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS, \ CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM, \ CFG_REFEXTRACT_MARKER_CLOSING_TITLE, \ CFG_REFEXTRACT_MARKER_CLOSING_SERIES, \ CFG_REFEXTRACT_MARKER_CLOSING_VOLUME, \ CFG_REFEXTRACT_MARKER_CLOSING_YEAR, \ CFG_REFEXTRACT_MARKER_CLOSING_PAGE, \ CFG_REFEXTRACT_MARKER_CLOSING_URL, \ CFG_REFEXTRACT_XML_VERSION, \ CFG_REFEXTRACT_XML_COLLECTION_OPEN, \ CFG_REFEXTRACT_XML_COLLECTION_CLOSE, \ CFG_REFEXTRACT_XML_RECORD_OPEN, \ CFG_REFEXTRACT_XML_RECORD_CLOSE from invenio.config import CFG_PATH_GFILE, CFG_PATH_PDFTOTEXT from invenio.search_engine import encode_for_xml except ImportError, importerror: import sys sys.stderr.write("Error: %s" % importerror) sys.stderr.flush() sys.exit(1) cli_opts = {} def get_url_repair_patterns(): """Initialise and return a list of precompiled regexp patterns that are used to try to re-assemble URLs that have been broken during a document's conversion to plain-text. @return: (list) of compiled sre regexp patterns used for finding various broken URLs. 
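       An illustrative doctest (editor's sketch, not part of the original
       module) of these patterns in use, via repair_broken_urls() defined
       further below:
           >>> repair_broken_urls(u'h t t p : / / cdsware.cern.ch/ is the CDSware site')
           u'http://cdsware.cern.ch/ is the CDSware site'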
""" file_types_list = [] file_types_list.append(r'h\s*?t\s*?m') ## htm file_types_list.append(r'h\s*?t\s*?m\s*?l') ## html file_types_list.append(r't\s*?x\s*?t') ## txt file_types_list.append(r'p\s*?h\s*?p') ## php file_types_list.append(r'a\s*?s\s*?p\s*?') ## asp file_types_list.append(r'j\s*?s\s*?p') ## jsp file_types_list.append(r'p\s*?y') ## py (python) file_types_list.append(r'p\s*?l') ## pl (perl) file_types_list.append(r'x\s*?m\s*?l') ## xml file_types_list.append(r'j\s*?p\s*?g') ## jpg file_types_list.append(r'g\s*?i\s*?f') ## gif file_types_list.append(r'm\s*?o\s*?v') ## mov file_types_list.append(r's\s*?w\s*?f') ## swf file_types_list.append(r'p\s*?d\s*?f') ## pdf file_types_list.append(r'p\s*?s') ## ps file_types_list.append(r'd\s*?o\s*?c') ## doc file_types_list.append(r't\s*?e\s*?x') ## tex file_types_list.append(r's\s*?h\s*?t\s*?m\s*?l') ## shtml pattern_list = [] pattern_list.append(sre.compile(r'(h\s*t\s*t\s*p\s*\:\s*\/\s*\/)', \ sre.I|sre.UNICODE)) pattern_list.append(sre.compile(r'(f\s*t\s*p\s*\:\s*\/\s*\/\s*)', \ sre.I|sre.UNICODE)) pattern_list.append(sre.compile(r'((http|ftp):\/\/\s*[\w\d])', \ sre.I|sre.UNICODE)) pattern_list.append(sre.compile(r'((http|ftp):\/\/([\w\d\s\._\-])+?\s*\/)', \ sre.I|sre.UNICODE)) pattern_list.append(sre.compile(r'((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\s\.\-])+?\/)+)', \ sre.I|sre.UNICODE)) p_url = \ sre.compile(r'((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\s\.\-])+?\/)*([\w\d\_\s\-]+\.\s?[\w\d]+))', \ sre.I|sre.UNICODE) pattern_list.append(p_url) ## some possible endings for URLs: for x in file_types_list: p_url = \ sre.compile(\ r'((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\.\-])+?\/)*([\w\d\_\-]+\.' + x + u'))', \ sre.I|sre.UNICODE) pattern_list.append(p_url) ## if url last thing in line, and only 10 letters max, concat them p_url = \ sre.compile(\ r'((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\.\-])+?\/)*\s*?([\w\d\_\.\-]\s?){1,10}\s*)$', \ sre.I|sre.UNICODE) pattern_list.append(p_url) return pattern_list def get_bad_char_replacements(): """When a document is converted to plain-text from PDF, certain characters may result in the plain-text, that are either unwanted, or broken. These characters need to be corrected or removed. Examples are, certain control characters that would be illegal in XML and must be removed; TeX ligatures (etc); broken accents such as umlauts on letters that must be corrected. This function returns a dictionary of (unwanted) characters to look for and the characters that should be used to replace them. @return: (dictionary) - { seek -> replace, } or charsacters to replace in plain-text. 
""" replacements = { ## Control characters not allowed in XML: u'\u2028' : u"", u'\u2029' : u"", u'\u202A' : u"", u'\u202B' : u"", u'\u202C' : u"", u'\u202D' : u"", u'\u202E' : u"", u'\u206A' : u"", u'\u206B' : u"", u'\u206C' : u"", u'\u206D' : u"", u'\u206E' : u"", u'\u206F' : u"", u'\uFFF9' : u"", u'\uFFFA' : u"", u'\uFFFB' : u"", u'\uFFFC' : u"", u'\uFEFF' : u"", ## Language Tag Code Points: u"\U000E0000" : u"", u"\U000E0001" : u"", u"\U000E0002" : u"", u"\U000E0003" : u"", u"\U000E0004" : u"", u"\U000E0005" : u"", u"\U000E0006" : u"", u"\U000E0007" : u"", u"\U000E0008" : u"", u"\U000E0009" : u"", u"\U000E000A" : u"", u"\U000E000B" : u"", u"\U000E000C" : u"", u"\U000E000D" : u"", u"\U000E000E" : u"", u"\U000E000F" : u"", u"\U000E0010" : u"", u"\U000E0011" : u"", u"\U000E0012" : u"", u"\U000E0013" : u"", u"\U000E0014" : u"", u"\U000E0015" : u"", u"\U000E0016" : u"", u"\U000E0017" : u"", u"\U000E0018" : u"", u"\U000E0019" : u"", u"\U000E001A" : u"", u"\U000E001B" : u"", u"\U000E001C" : u"", u"\U000E001D" : u"", u"\U000E001E" : u"", u"\U000E001F" : u"", u"\U000E0020" : u"", u"\U000E0021" : u"", u"\U000E0022" : u"", u"\U000E0023" : u"", u"\U000E0024" : u"", u"\U000E0025" : u"", u"\U000E0026" : u"", u"\U000E0027" : u"", u"\U000E0028" : u"", u"\U000E0029" : u"", u"\U000E002A" : u"", u"\U000E002B" : u"", u"\U000E002C" : u"", u"\U000E002D" : u"", u"\U000E002E" : u"", u"\U000E002F" : u"", u"\U000E0030" : u"", u"\U000E0031" : u"", u"\U000E0032" : u"", u"\U000E0033" : u"", u"\U000E0034" : u"", u"\U000E0035" : u"", u"\U000E0036" : u"", u"\U000E0037" : u"", u"\U000E0038" : u"", u"\U000E0039" : u"", u"\U000E003A" : u"", u"\U000E003B" : u"", u"\U000E003C" : u"", u"\U000E003D" : u"", u"\U000E003E" : u"", u"\U000E003F" : u"", u"\U000E0040" : u"", u"\U000E0041" : u"", u"\U000E0042" : u"", u"\U000E0043" : u"", u"\U000E0044" : u"", u"\U000E0045" : u"", u"\U000E0046" : u"", u"\U000E0047" : u"", u"\U000E0048" : u"", u"\U000E0049" : u"", u"\U000E004A" : u"", u"\U000E004B" : u"", u"\U000E004C" : u"", u"\U000E004D" : u"", u"\U000E004E" : u"", u"\U000E004F" : u"", u"\U000E0050" : u"", u"\U000E0051" : u"", u"\U000E0052" : u"", u"\U000E0053" : u"", u"\U000E0054" : u"", u"\U000E0055" : u"", u"\U000E0056" : u"", u"\U000E0057" : u"", u"\U000E0058" : u"", u"\U000E0059" : u"", u"\U000E005A" : u"", u"\U000E005B" : u"", u"\U000E005C" : u"", u"\U000E005D" : u"", u"\U000E005E" : u"", u"\U000E005F" : u"", u"\U000E0060" : u"", u"\U000E0061" : u"", u"\U000E0062" : u"", u"\U000E0063" : u"", u"\U000E0064" : u"", u"\U000E0065" : u"", u"\U000E0066" : u"", u"\U000E0067" : u"", u"\U000E0068" : u"", u"\U000E0069" : u"", u"\U000E006A" : u"", u"\U000E006B" : u"", u"\U000E006C" : u"", u"\U000E006D" : u"", u"\U000E006E" : u"", u"\U000E006F" : u"", u"\U000E0070" : u"", u"\U000E0071" : u"", u"\U000E0072" : u"", u"\U000E0073" : u"", u"\U000E0074" : u"", u"\U000E0075" : u"", u"\U000E0076" : u"", u"\U000E0077" : u"", u"\U000E0078" : u"", u"\U000E0079" : u"", u"\U000E007A" : u"", u"\U000E007B" : u"", u"\U000E007C" : u"", u"\U000E007D" : u"", u"\U000E007E" : u"", u"\U000E007F" : u"", ## Musical Notation Scoping u"\U0001D173" : u"", u"\U0001D174" : u"", u"\U0001D175" : u"", u"\U0001D176" : u"", u"\U0001D177" : u"", u"\U0001D178" : u"", u"\U0001D179" : u"", u"\U0001D17A" : u"", u'\u0001' : u"", ## START OF HEADING ## START OF TEXT & END OF TEXT: u'\u0002' : u"", u'\u0003' : u"", u'\u0004' : u"", ## END OF TRANSMISSION ## ENQ and ACK u'\u0005' : u"", u'\u0006' : u"", u'\u0007' : u"", # BELL u'\u0008' : u"", # BACKSPACE ## SHIFT-IN & SHIFT-OUT 
u'\u000E' : u"", u'\u000F' : u"", ## Other controls: u'\u0010' : u"", ## DATA LINK ESCAPE u'\u0011' : u"", ## DEVICE CONTROL ONE u'\u0012' : u"", ## DEVICE CONTROL TWO u'\u0013' : u"", ## DEVICE CONTROL THREE u'\u0014' : u"", ## DEVICE CONTROL FOUR u'\u0015' : u"", ## NEGATIVE ACK u'\u0016' : u"", ## SYNCRONOUS IDLE u'\u0017' : u"", ## END OF TRANSMISSION BLOCK u'\u0018' : u"", ## CANCEL u'\u0019' : u"", ## END OF MEDIUM u'\u001A' : u"", ## SUBSTITUTE u'\u001B' : u"", ## ESCAPE u'\u001C' : u"", ## INFORMATION SEPARATOR FOUR (file separator) u'\u001D' : u"", ## INFORMATION SEPARATOR THREE (group separator) u'\u001E' : u"", ## INFORMATION SEPARATOR TWO (record separator) u'\u001F' : u"", ## INFORMATION SEPARATOR ONE (unit separator) ## \r -> remove it u'\r' : u"", ## Strange parantheses - change for normal: u'\x1c' : u'(', u'\x1d' : u')', ## Some ff from tex: u'\u0013\u0010' : u'\u00ED', u'\x0b' : u'ff', ## fi from tex: u'\x0c' : u'fi', ## ligatures from TeX: u'\ufb00' : u'ff', u'\ufb01' : u'fi', u'\ufb02' : u'fl', u'\ufb03' : u'ffi', u'\ufb04' : u'ffl', ## Superscripts from TeX u'\u2212' : u'-', u'\u2013' : u'-', ## Word style speech marks: u'\u201d' : u'"', u'\u201c' : u'"', ## pdftotext has problems with umlaut and prints it as diaeresis followed by a letter:correct it ## (Optional space between char and letter - fixes broken line examples) u'\u00A8 a' : u'\u00E4', u'\u00A8 e' : u'\u00EB', u'\u00A8 i' : u'\u00EF', u'\u00A8 o' : u'\u00F6', u'\u00A8 u' : u'\u00FC', u'\u00A8 y' : u'\u00FF', u'\u00A8 A' : u'\u00C4', u'\u00A8 E' : u'\u00CB', u'\u00A8 I' : u'\u00CF', u'\u00A8 O' : u'\u00D6', u'\u00A8 U' : u'\u00DC', u'\u00A8 Y' : u'\u0178', u'\xA8a' : u'\u00E4', u'\xA8e' : u'\u00EB', u'\xA8i' : u'\u00EF', u'\xA8o' : u'\u00F6', u'\xA8u' : u'\u00FC', u'\xA8y' : u'\u00FF', u'\xA8A' : u'\u00C4', u'\xA8E' : u'\u00CB', u'\xA8I' : u'\u00CF', u'\xA8O' : u'\u00D6', u'\xA8U' : u'\u00DC', u'\xA8Y' : u'\u0178', ## More umlaut mess to correct: u'\x7fa' : u'\u00E4', u'\x7fe' : u'\u00EB', u'\x7fi' : u'\u00EF', u'\x7fo' : u'\u00F6', u'\x7fu' : u'\u00FC', u'\x7fy' : u'\u00FF', u'\x7fA' : u'\u00C4', u'\x7fE' : u'\u00CB', u'\x7fI' : u'\u00CF', u'\x7fO' : u'\u00D6', u'\x7fU' : u'\u00DC', u'\x7fY' : u'\u0178', u'\x7f a' : u'\u00E4', u'\x7f e' : u'\u00EB', u'\x7f i' : u'\u00EF', u'\x7f o' : u'\u00F6', u'\x7f u' : u'\u00FC', u'\x7f y' : u'\u00FF', u'\x7f A' : u'\u00C4', u'\x7f E' : u'\u00CB', u'\x7f I' : u'\u00CF', u'\x7f O' : u'\u00D6', u'\x7f U' : u'\u00DC', u'\x7f Y' : u'\u0178', ## pdftotext: fix accute accent: u'\x13a' : u'\u00E1', u'\x13e' : u'\u00E9', u'\x13i' : u'\u00ED', u'\x13o' : u'\u00F3', u'\x13u' : u'\u00FA', u'\x13y' : u'\u00FD', u'\x13A' : u'\u00C1', u'\x13E' : u'\u00C9', u'\x13I' : u'\u00CD', u'\x13O' : u'\u00D3', u'\x13U' : u'\u00DA', u'\x13Y' : u'\u00DD', u'\x13 a' : u'\u00E1', u'\x13 e' : u'\u00E9', u'\x13 i' : u'\u00ED', u'\x13 o' : u'\u00F3', u'\x13 u' : u'\u00FA', u'\x13 y' : u'\u00FD', u'\x13 A' : u'\u00C1', u'\x13 E' : u'\u00C9', u'\x13 I' : u'\u00CD', u'\x13 O' : u'\u00D3', u'\x13 U' : u'\u00DA', u'\x13 Y' : u'\u00DD', u'\u00B4 a' : u'\u00E1', u'\u00B4 e' : u'\u00E9', u'\u00B4 i' : u'\u00ED', u'\u00B4 o' : u'\u00F3', u'\u00B4 u' : u'\u00FA', u'\u00B4 y' : u'\u00FD', u'\u00B4 A' : u'\u00C1', u'\u00B4 E' : u'\u00C9', u'\u00B4 I' : u'\u00CD', u'\u00B4 O' : u'\u00D3', u'\u00B4 U' : u'\u00DA', u'\u00B4 Y' : u'\u00DD', u'\u00B4a' : u'\u00E1', u'\u00B4e' : u'\u00E9', u'\u00B4i' : u'\u00ED', u'\u00B4o' : u'\u00F3', u'\u00B4u' : u'\u00FA', u'\u00B4y' : u'\u00FD', u'\u00B4A' : u'\u00C1', u'\u00B4E' : 
u'\u00C9', u'\u00B4I' : u'\u00CD', u'\u00B4O' : u'\u00D3', u'\u00B4U' : u'\u00DA', u'\u00B4Y' : u'\u00DD', ## pdftotext: fix grave accent: u'\u0060 a' : u'\u00E0', u'\u0060 e' : u'\u00E8', u'\u0060 i' : u'\u00EC', u'\u0060 o' : u'\u00F2', u'\u0060 u' : u'\u00F9', u'\u0060 A' : u'\u00C0', u'\u0060 E' : u'\u00C8', u'\u0060 I' : u'\u00CC', u'\u0060 O' : u'\u00D2', u'\u0060 U' : u'\u00D9', u'\u0060a' : u'\u00E0', u'\u0060e' : u'\u00E8', u'\u0060i' : u'\u00EC', u'\u0060o' : u'\u00F2', u'\u0060u' : u'\u00F9', u'\u0060A' : u'\u00C0', u'\u0060E' : u'\u00C8', u'\u0060I' : u'\u00CC', u'\u0060O' : u'\u00D2', u'\u0060U' : u'\u00D9', ## \02C7 : caron u'\u02C7C' : u'\u010C', u'\u02C7c' : u'\u010D', u'\u02C7S' : u'\u0160', u'\u02C7s' : u'\u0161', u'\u02C7Z' : u'\u017D', u'\u02C7z' : u'\u017E', ## \027 : aa (a with ring above) u'\u02DAa' : u'\u00E5', u'\u02DAA' : u'\u00C5', ## \030 : cedilla u'\u0327c' : u'\u00E7', u'\u0327C' : u'\u00C7', ## \02DC : tilde u'\u02DCn' : u'\u00F1', u'\u02DCN' : u'\u00D1', u'\u02DCo' : u'\u00F5', u'\u02DCO' : u'\u00D5', u'\u02DCa' : u'\u00E3', u'\u02DCA' : u'\u00C3', } return replacements ## precompile some often-used regexp for speed reasons: sre_regexp_character_class = sre.compile(r'\[[^\]]+\]', sre.UNICODE) sre_space_comma = sre.compile(r'\s,', sre.UNICODE) sre_space_semicolon = sre.compile(r'\s;', sre.UNICODE) sre_space_period = sre.compile(r'\s\.', sre.UNICODE) sre_colon_space_colon = sre.compile(r':\s:', sre.UNICODE) sre_comma_space_colon = sre.compile(r',\s:', sre.UNICODE) sre_space_closing_square_bracket = sre.compile(r'\s\]', sre.UNICODE) sre_opening_square_bracket_space = sre.compile(r'\[\s', sre.UNICODE) sre_hyphens = sre.compile(r'(\\255|\u02D7|\u0335|\u0336|\u2212|\u002D|\uFE63|\uFF0D)', sre.UNICODE) sre_multiple_hyphens = sre.compile(r'-{2,}', sre.UNICODE) sre_multiple_space = sre.compile(r'\s{2,}', sre.UNICODE) sre_group_captured_multiple_space = sre.compile(r'(\s{2,})', sre.UNICODE) sre_colon_not_followed_by_numeration_tag = sre.compile(r':(?!\s*<cds)', sre.UNICODE|sre.I) ## Patterns used for creating institutional preprint report-number ## recognition patterns (used by function "institute_num_pattern_to_regex"): ## Recognise any character that isn't a->z, A->Z, 0->9, /, [, ], ' ', '"': sre_report_num_chars_to_escape = sre.compile(r'([^\]A-Za-z0-9\/\[ "])', sre.UNICODE) ## Replace "hello" with hello: sre_extract_quoted_text = (sre.compile(r'\"([^"]+)\"', sre.UNICODE), r'\g<1>',) ## Replace / [abcd ]/ with /( [abcd])?/ : sre_extract_char_class = (sre.compile(r' \[([^\]]+) \]', sre.UNICODE), r'( [\g<1>])?') ### ## URL recognition: ## Stand-alone URL (e.g. http //cdsware.cern.ch/ ) sre_raw_url = \ sre.compile(r'((https?|s?ftp) \/\/([\w\d\_\.\-])+(\/([\w\d\_\.\-])+)*(\/([\w\d\_\-]+\.\w{1,6})?)?)', \ sre.UNICODE|sre.I) ## HTML marked-up URL (e.g. 
<a href="http //cdsware.cern.ch/">CERN Document Server Software Consortium</a> ) sre_html_tagged_url = \ sre.compile(r'(\<a\s+href\s*=\s*([\'"])?(((https?|s?ftp) \/\/)?([\w\d\_\.\-])+(\/([\w\d\_\.\-])+)*(\/([\w\d\_\-]+\.\w{1,6})?)?)([\'"])?\>([^\<]+)\<\/a\>)', \ sre.UNICODE|sre.I) ## Numeration recognition pattern - used to identify numeration associated with a title when ## marking the title up into MARC XML: sre_recognised_numeration_for_title = \ sre.compile(r'^(\s*\.?,?\s*:?\s\<cds\.VOL\>(\d+)\<\/cds\.VOL> \<cds\.YR\>\(([1-2]\d\d\d)\)\<\/cds\.YR\> \<cds\.PG\>([RL]?\d+[c]?)\<\/cds\.PG\>)', sre.UNICODE) sre_title_followed_by_series_markup_tags = \ sre.compile(r'(\<cds.TITLE\>([^\<]+)\<\/cds.TITLE\>\s*.?\s*\<cds\.SER\>([A-H]|(I{1,3}V?|VI{0,3}))\<\/cds\.SER\>)', sre.UNICODE) sre_punctuation = sre.compile(r'[\.\,\;\'\(\)\-]', sre.UNICODE) #sre_tagged_citation = sre.compile(r'\<cds\.(TITLE|VOL|YR|PG|REPORTNUMBER|SER|URL).*?\>', sre.UNICODE) sre_tagged_citation = sre.compile(r'\<cds\.(TITLE|VOL|YR|PG|REPORTNUMBER|SER|URL)( description=\"[^\"]*\")?\>', sre.UNICODE) ## is there pre-recognised numeration-tagging within a few characters of the start if this part of the line? sre_tagged_numeration_near_line_start = sre.compile(r'^.{0,4}?<CDS (VOL|SER)>', sre.UNICODE) sre_ibid = sre.compile(r'(-|\b)(IBID\.?( ([A-H]|(I{1,3}V?|VI{0,3})|[1-3]))?)\s?:', sre.UNICODE) sre_matched_ibid = sre.compile(r'IBID\.?\s?([A-H]|(I{1,3}V?|VI{0,3})|[1-3])?', sre.UNICODE) sre_title_series = sre.compile(r'\, +([A-H]|(I{1,3}V?|VI{0,3}))$', sre.UNICODE) ## After having processed a line for titles, it may be possible to find more numeration with the ## aid of the recognised titles. The following 2 patterns are used for this: sre_correct_numeration_2nd_try_ptn1 = \ (sre.compile(r'\(?([12]\d{3})([A-Za-z]?)\)?,? *(<cds\.TITLE>(\.|[^<])*<\/cds\.TITLE>),? *(\b[Vv]o?l?\.?)?\s?(\d+)(,\s*|\s+)[pP]?[p]?\.?\s?([RL]?\d+[c]?)\-?[RL]?\d{0,6}[c]?', sre.UNICODE), \ '\\g<1>\\g<2>, \\g<3> \\g<6> (\\g<1>) \\g<8>' ) sre_correct_numeration_2nd_try_ptn2 = \ (sre.compile(r'\(?([12]\d{3})([A-Za-z]?)\)?,? *(<cds\.TITLE>(\.|[^<])*<\/cds\.TITLE>),? *(\b[Vv]o?l?\.?)?\s?(\d+)\s?([A-H])\s?[pP]?[p]?\.?\s?([RL]?\d+[c]?)\-?[RL]?\d{0,6}[c]?', sre.UNICODE), \ '\\g<1>\\g<2>, \\g<3> \\g<6> \\g<7> \\g<8> (\\g<1>)' ) ## precompile some regexps used to search for and standardize numeration patterns in a line for the first time: ## Delete the colon and expressions such as Serie, vol, V. inside the pattern <serie : volume> ## E.g.: Replace the string """Series A, Vol 4""" with """A 4""" sre_strip_series_and_volume_labels = (sre.compile(r'(Serie\s|\bS\.?\s)?([A-H])\s?[:,]\s?(\b[Vv]o?l?\.?)?\s?(\d+)', sre.UNICODE), unicode('\\g<2> \\g<4>')) ## This pattern is not compiled, but rather included in the other numeration paterns: _sre_non_compiled_pattern_nucphysb_subtitle = r'(?:[\(\[]\s*?(?:[Ff][Ss]|[Pp][Mm])\s*?\d{0,4}\s*?[\)\]])?' ## the 4 main numeration patterns: ## Pattern 0 (was pattern 3): <x, vol, page, year> sre_numeration_vol_nucphys_page_yr = (sre.compile(r'(\b[Vv]o?l?\.?)?\s?(\d+)\s?[,:\s]\s?' 
+\ _sre_non_compiled_pattern_nucphysb_subtitle +\ r'[,;:\s]?[pP]?[p]?\.?\s?([RL]?\d+[c]?)(?:\-|\255)?[RL]?\d{0,6}[c]?,?\s?\(?([1-2]\d\d\d)\)?', \ sre.UNICODE), \ unicode(' : <cds.VOL>\\g<2></cds.VOL> <cds.YR>(\\g<4>)</cds.YR> <cds.PG>\\g<3></cds.PG> ')) sre_numeration_nucphys_vol_page_yr = (sre.compile(r'\b' + _sre_non_compiled_pattern_nucphysb_subtitle +\ r'[,;:\s]?([Vv]o?l?\.?)?\s?(\d+)\s?[,:\s]\s?[pP]?[p]?\.?\s?([RL]?\d+[c]?)(?:\-|\255)?[RL]?\d{0,6}[c]?,?\s?\(?([1-2]\d\d\d)\)?', sre.UNICODE),\ unicode(' : <cds.VOL>\\g<2></cds.VOL> <cds.YR>(\\g<4>)</cds.YR> <cds.PG>\\g<3></cds.PG> ')) ## Pattern 1: <x, vol, year, page> ## <v, [FS]?, y, p> sre_numeration_vol_nucphys_yr_page = (sre.compile(r'(\b[Vv]o?l?\.?)?\s?(\d+)\s?' +\ _sre_non_compiled_pattern_nucphysb_subtitle +\ r'[,;:\s]?\(([1-2]\d\d\d)\),?\s?[pP]?[p]?\.?\s?([RL]?\d+[c]?)(?:\-|\255)?[RL]?\d{0,6}[c]?', sre.UNICODE),\ unicode(' : <cds.VOL>\\g<2></cds.VOL> <cds.YR>(\\g<3>)</cds.YR> <cds.PG>\\g<4></cds.PG> ')) ## <[FS]?, v, y, p> sre_numeration_nucphys_vol_yr_page = (sre.compile(r'\b' + _sre_non_compiled_pattern_nucphysb_subtitle +\ r'[,;:\s]?([Vv]o?l?\.?)?\s?(\d+)\s?\(([1-2]\d\d\d)\),?\s?[pP]?[p]?\.?\s?([RL]?\d+[c]?)(?:\-|\255)?[RL]?\d{0,6}[c]?', sre.UNICODE),\ unicode(' : <cds.VOL>\\g<2></cds.VOL> <cds.YR>(\\g<3>)</cds.YR> <cds.PG>\\g<4></cds.PG> ')) ## Pattern 2: <vol, serie, year, page> ## <v, s, [FS]?, y, p> sre_numeration_vol_series_nucphys_yr_page = (sre.compile(r'(\b[Vv]o?l?\.?)?\s?(\d+)\s?([A-H])\s?' + _sre_non_compiled_pattern_nucphysb_subtitle +\ r'[,;:\s]?\(([1-2]\d\d\d)\),?\s?[pP]?[p]?\.?\s?([RL]?\d+[c]?)(?:\-|\255)?[RL]?\d{0,6}[c]?', sre.UNICODE),\ unicode(' <cds.SER>\\g<3></cds.SER> : <cds.VOL>\\g<2></cds.VOL> <cds.YR>(\\g<4>)</cds.YR> <cds.PG>\\g<5></cds.PG> ')) ## <v, [FS]?, s, y, p sre_numeration_vol_nucphys_series_yr_page = (sre.compile(r'(\b[Vv]o?l?\.?)?\s?(\d+)\s?' + _sre_non_compiled_pattern_nucphysb_subtitle +\ r'[,;:\s]?([A-H])\s?\(([1-2]\d\d\d)\),?\s?[pP]?[p]?\.?\s?([RL]?\d+[c]?)(?:\-|\255)?[RL]?\d{0,6}[c]?', sre.UNICODE),\ unicode(' <cds.SER>\\g<3></cds.SER> : <cds.VOL>\\g<2></cds.VOL> <cds.YR>(\\g<4>)</cds.YR> <cds.PG>\\g<5></cds.PG> ')) ## Pattern 4: <vol, serie, page, year> ## <v, s, [FS]?, p, y> sre_numeration_vol_series_nucphys_page_yr = (sre.compile(r'(\b[Vv]o?l?\.?)?\s?(\d+)\s?([A-H])[,:\s]\s?' + _sre_non_compiled_pattern_nucphysb_subtitle +\ r'[,;:\s]?[pP]?[p]?\.?\s?([RL]?\d+[c]?)(?:\-|\255)?[RL]?\d{0,6}[c]?,?\s?\(([1-2]\d\d\d)\)', sre.UNICODE),\ unicode(' <cds.SER>\\g<3></cds.SER> : <cds.VOL>\\g<2></cds.VOL> <cds.YR>(\\g<5>)</cds.YR> <cds.PG>\\g<4></cds.PG> ')) ## <v, [FS]?, s, p, y> sre_numeration_vol_nucphys_series_page_yr = (sre.compile(r'(\b[Vv]o?l?\.?)?\s?(\d+)\s?' + _sre_non_compiled_pattern_nucphysb_subtitle +\ r'[,;:\s]?([A-H])[,:\s]\s?[pP]?[p]?\.?\s?([RL]?\d+[c]?)(?:\-|\255)?[RL]?\d{0,6}[c]?,?\s?\(([1-2]\d\d\d)\)', sre.UNICODE),\ unicode(' <cds.SER>\\g<3></cds.SER> : <cds.VOL>\\g<2></cds.VOL> <cds.YR>(\\g<5>)</cds.YR> <cds.PG>\\g<4></cds.PG> ')) ## a list of patterns used to try to repair broken URLs within reference lines: sre_list_url_repair_patterns = get_url_repair_patterns() ## a dictionary of undesirable characters and their replacements: undesirable_char_replacements = get_bad_char_replacements() ## General initiation tasks: def get_recids_and_filepaths(args): """from a list of arguments in the form "recid:filepath" (["1:filepath", "2:filepath", [...]]) split each string into 2 parts: the record ID and the filepath. 
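    An illustrative doctest (editor's sketch; note that the warning for the
    malformed argument goes to stderr, so it does not appear in the expected
    output):
        >>> get_recids_and_filepaths(["1:/tmp/a.pdf", "bad-arg", "2:/tmp/b.pdf"])
        [('1', '/tmp/a.pdf'), ('2', '/tmp/b.pdf')]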
    @param args: a list of strings
    @return: a list of tuples: [(recid, filepath)]
    """
    jobs = []
    for x in args:
        items = x.split(":")
        if len(items) != 2:
            sys.stderr.write(u"W: Recid:filepath argument invalid. Skipping.\n")
            continue
        jobs.append((items[0], items[1]))
    return jobs

-## components relating to the standardisation and recognition of citations in reference lines:
+## components relating to the standardisation and
+## recognition of citations in reference lines:

def repair_broken_urls(line):
-    """Attempt to repair broken URLs in a line of text. (E.g.: remove spaces from the middle of
-       a URL; something like that.)
+    """Attempt to repair broken URLs in a line of text.
+       (E.g.: remove spaces from the middle of a URL; something like
+       that.)
    @param line: (string) the line in which to check for broken URLs.
-    @return: (string) the line after any broken URLs have (hopefully!) been repaired.
+    @return: (string) the line after any broken URLs have (hopefully!)
+       been repaired.
    """
    def _chop_spaces_in_url_match(m):
        """Suppresses spaces in a matched URL.
        """
        return m.group(1).replace(" ", "")
    for ptn in sre_list_url_repair_patterns:
        line = ptn.sub(_chop_spaces_in_url_match, line)
    return line

def replace_undesirable_characters(line):
    """Replace certain bad characters in a text line.
-    @param line: (string) the text line in which bad characters are to be replaced.
-    @return: (string) the text line after the bad characters have been replaced.
+    @param line: (string) the text line in which bad characters are to
+       be replaced.
+    @return: (string) the text line after the bad characters have been
+       replaced.
    """
    bad_chars = undesirable_char_replacements.keys()
    for bad_char in bad_chars:
        try:
            line = line.replace(bad_char, undesirable_char_replacements[bad_char])
        except UnicodeDecodeError:
            pass
    return line

def remove_and_record_multiple_spaces_in_line(line):
-    """For a given string, locate all ocurrences of multiple spaces together in the line, record the
-       number of spaces found at each position, and replace them with a single space.
+    """For a given string, locate all occurrences of multiple spaces together
+       in the line, record the number of spaces found at each position, and
+       replace them with a single space.
    @param line: (string) the text line to be processed for multiple spaces.
-    @return: (tuple) countaining a dictionary and a string. The dictionary contains information about
-       the number of spaces removed at given positions in the line. For example, if 3 spaces were removed
-       from the line at index '22', the dictionary would be set as follows: { 22 : 3 }
-       The string that is also returned in this tuple is the line after multiple-space ocurrences have
-       replaced with single spaces.
+    @return: (tuple) containing a dictionary and a string. The dictionary
+       contains information about the number of spaces removed at given
+       positions in the line. For example, if 3 spaces were removed from the
+       line at index '22', the dictionary would be set as follows: { 22 : 3 }
+       The string that is also returned in this tuple is the line after
+       multiple-space occurrences have been replaced with single spaces.
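+       Illustrative doctest (editor's sketch of the described behaviour;
+       not part of the original patch):
+           >>> remove_and_record_multiple_spaces_in_line(u'a  b   c')
+           ({1: 1, 4: 2}, u'a b c')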
""" removed_spaces = {} ## get a collection of match objects for all instances of multiple-spaces found in the line: multispace_matches = sre_group_captured_multiple_space.finditer(line) ## record the number of spaces found at each match position: for multispace in multispace_matches: removed_spaces[multispace.start()] = (multispace.end() - multispace.start() - 1) ## now remove the multiple-spaces from the line, replacing with a single space at each position: line = sre_group_captured_multiple_space.sub(u' ', line) return (removed_spaces, line) def wash_line(line): - """Wash a text line of certain punctuation errors, replacing them with more correct - alternatives. E.g.: the string 'Yes , I like python.' will be transformed into - 'Yes, I like python.' + """Wash a text line of certain punctuation errors, replacing them with + more correct alternatives. E.g.: the string 'Yes , I like python.' + will be transformed into 'Yes, I like python.' @param line: (string) the line to be washed. @return: (string) the washed line. """ line = sre_space_comma.sub(',', line) line = sre_space_semicolon.sub(';', line) line = sre_space_period.sub('.', line) line = sre_colon_space_colon.sub(':', line) line = sre_comma_space_colon.sub(':', line) line = sre_space_closing_square_bracket.sub(']', line) line = sre_opening_square_bracket_space.sub('[', line) line = sre_hyphens.sub('-', line) line = sre_colon_not_followed_by_numeration_tag.sub(' ', line) line = sre_multiple_space.sub(' ', line) return line def _order_institute_preprint_reference_numeration_patterns_by_length(numeration_patterns): """Given a list of user-defined patterns for recognising the numeration styles of an institute's preprint references, for each pattern, strip out character classes and record the length of the pattern. Then add the length and the original pattern (in a tuple) into a new list for these patterns and return this list. @param numeration_patterns: (list) of strings, whereby each string is a numeration pattern. @return: (list) of tuples, where each tuple contains a pattern and its length. """ def _compfunc_bylen(a, b): """Compares regexp patterns by the length of the pattern-text. """ if a[0] < b[0]: return 1 elif a[0] == b[0]: return 0 else: return -1 pattern_list = [] for pattern in numeration_patterns: base_pattern = sre_regexp_character_class.sub('1', pattern) pattern_list.append((len(base_pattern), pattern)) pattern_list.sort(_compfunc_bylen) return pattern_list def create_institute_numeration_group_regexp_pattern(patterns): """Using a list of regexp patterns for recognising numeration patterns for institute preprint references, ordered by length - longest to shortest - create a grouped 'OR' or of these patterns, ready to be used in a bigger regexp. @param patterns: (list) of strings. All of the numeration regexp patterns for recognising an institute's preprint reference styles. @return: (string) a grouped 'OR' regexp pattern of the numeration patterns. 
E.g.: (?P<num>[12]\d{3} \d\d\d|\d\d \d\d\d|[A-Za-z] \d\d\d) """ grouped_numeration_pattern = u"" if len(patterns) > 0: grouped_numeration_pattern = u"(?P<numn>" for pattern in patterns: grouped_numeration_pattern += institute_num_pattern_to_regex(pattern[1]) + u"|" grouped_numeration_pattern = grouped_numeration_pattern[0:len(grouped_numeration_pattern) - 1] grouped_numeration_pattern += u")" return grouped_numeration_pattern def institute_num_pattern_to_regex(pattern): """Given a numeration pattern from the institutes preprint report numbers KB, convert it to turn it into a regexp string for recognising such patterns in a reference line. Change: \ -> \\ 9 -> \d a -> [A-Za-z] mm -> (0[1-9]|1[0-2]) yy -> \d{2} yyyy -> [12]\d{3} / -> \/ s -> \s*? @param pattern: (string) a user-defined preprint reference numeration pattern. @return: (string) the regexp for recognising the pattern. """ simple_replacements = [ ('9', r'\d'), ('a', r'[A-Za-z]'), ('mm', r'(0[1-9]|1[0-2])'), ('yyyy', r'[12]\d{3}'), ('yy', r'\d\d'), ('s', r'\s*?'), (r'/', r'\/') ] ## first, escape certain characters that could be sensitive to a regexp: pattern = sre_report_num_chars_to_escape.sub(r'\\\g<1>', pattern) ## now loop through and carry out the simple replacements: for repl in simple_replacements: pattern = pattern.replace(repl[0], repl[1]) ## now replace a couple of regexp-like paterns: ## quoted string with non-quoted version ("hello" with hello); ## Replace / [abcd ]/ with /( [abcd])?/ : pattern = sre_extract_quoted_text[0].sub(sre_extract_quoted_text[1], pattern) pattern = sre_extract_char_class[0].sub(sre_extract_char_class[1], pattern) ## the pattern has been transformed return pattern def build_institutes_preprints_numeration_knowledge_base(fpath): """Given the path to a knowledge base file containing the details of institutes and the patterns that their preprint report numberring schemes take, create a dictionary of regexp search patterns to recognise these preprint references in reference lines, and a dictionary of replacements for non-standard preprint categories in these references. The knowledge base file should consist only of lines that take one of the following 3 formats: #####Institute Name#### (the name of the institute to which the preprint reference patterns belong, e.g. '#####LANL#####', surrounded by 5 # on either side.) <pattern> (numeration patterns for an institute's preprints, surrounded by < and >.) seek-term --- replace-term (i.e. a seek phrase on the left hand side, a replace phrase on the right hand side, with the two phrases being separated by 3 hyphens.) E.g.: ASTRO PH ---astro-ph The left-hand side term is a non-standard version of the preprint reference category; the right-hand side term is the standard version. If the KB file cannot be read from, or an unexpected line is encountered in the KB, an error message is output to standard error and execution is halted with an error-code 0. @param fpath: (string) the path to the knowledge base file. @return: (tuple) containing 2 dictionaries. The first contains regexp search patterns used to identify preprint references in a line. This dictionary is keyed by a tuple containing the line number of the pattern in the KB and the non-standard category string. E.g.: (3, 'ASTRO PH'). The second dictionary contains the standardised category string, and is keyed by the non-standard category string. E.g.: 'astro-ph'. 
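       An illustrative KB fragment (editor's hypothetical example; the real
       KB entries may differ):
           #####LANL#####
           </yymm999>
           HEP TH ---hep-th
       ...which would build a search pattern recognising strings such as
       'HEP TH/9910245' in a working line, to be standardised as
       'hep-th/9910245'.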
""" def _add_institute_preprint_patterns(preprint_classifications, preprint_numeration_ptns,\ preprint_reference_search_regexp_patterns, \ standardised_preprint_reference_categories, kb_line_num): """For a list of preprint category strings and preprint numeration patterns for a given institute, create the regexp patterns for each of the preprint types. Add the regexp patterns to the dictionary of search patterns (preprint_reference_search_regexp_patterns), keyed by the line number of the institute in the KB, and the preprint category search string. Also add the standardised preprint category string to another dictionary, keyed by the line number of its position in the KB and its non-standardised version. @param preprint_classifications: (list) of tuples whereby each tuple contains a preprint category search string and the line number of the name of institute to which it belongs in the KB. E.g.: (45, 'ASTRO PH'). @param preprint_numeration_ptns: (list) of preprint reference numeration search patterns (strings) @param preprint_reference_search_regexp_patterns: (dictionary) of regexp patterns used to search in document lines. @param standardised_preprint_reference_categories: (dictionary) containing the standardised strings for preprint reference categories. (E.g. 'astro-ph'.) @param kb_line_num: (integer) - the line number int the KB at which a given institute name was found. @return: None """ if len(preprint_classifications) > 0 and \ len(preprint_numeration_ptns) > 0: ## the previous institute had both numeration styles and categories for preprint references. ## build regexps and add them for this institute: ## First, order the numeration styles by line-length, and build a grouped regexp for recognising numeration: ordered_patterns = _order_institute_preprint_reference_numeration_patterns_by_length(preprint_numeration_ptns) ## create a grouped regexp for numeration part of preprint reference: numeration_regexp = create_institute_numeration_group_regexp_pattern(ordered_patterns) ## for each "classification" part of preprint references, create a complete regex: ## will be in the style "(categ)-(numatn1|numatn2|numatn3|...)" for classification in preprint_classifications: search_pattern_str = r'[^a-zA-Z0-9\/\.]((?P<categ>' \ + classification[0] + u')' \ + numeration_regexp + r')' sre_search_pattern = sre.compile(search_pattern_str, sre.UNICODE) preprint_reference_search_regexp_patterns[(kb_line_num, classification[0])] = sre_search_pattern standardised_preprint_reference_categories[(kb_line_num, classification[0])] = classification[1] preprint_reference_search_regexp_patterns = {} ## a dictionary of paterns used to recognise categories of ## preprints as used by various institutes standardised_preprint_reference_categories = {} ## dictionary of standardised category strings for preprint cats current_institute_preprint_classifications = [] ## list of tuples containing preprint categories in their raw and ## standardised forms, as read from the KB current_institute_numerations = [] ## list of preprint numeration patterns, as read from the KB sre_institute_name = sre.compile(r'^\#{5}\s*(.+)\s*\#{5}$', sre.UNICODE) ## pattern to recognise an ## institute name line in KB sre_preprint_classification = sre.compile(r'^\s*(\w.*?)\s*---\s*(\w.*?)\s*$', sre.UNICODE) ## pattern to recognise ## an institute preprint ## categ line in KB sre_numeration_pattern = sre.compile(r'^\<(.+)\>$', sre.UNICODE) ## pattern to recognise a preprint ## numeration-style line in KB kb_line_num = 0 ## when making the 
dictionary of patterns, which is keyed by the category search string, ## this counter will ensure that patterns in the dictionary are not overwritten if 2 ## institutes have the same category styles. try: fh = open(fpath, "r") for rawline in fh: rawline = rawline.decode("utf-8") kb_line_num += 1 m_institute_name = sre_institute_name.search(rawline) if m_institute_name is not None: ## This KB line is the name of an institute ## append the last institute's pattern list to the list of institutes: _add_institute_preprint_patterns(current_institute_preprint_classifications,\ current_institute_numerations,\ preprint_reference_search_regexp_patterns, \ standardised_preprint_reference_categories, kb_line_num) ## Now start a new dictionary to contain the search patterns for this institute: current_institute_preprint_classifications = [] current_institute_numerations = [] ## move on to the next line continue m_preprint_classification = sre_preprint_classification.search(rawline) if m_preprint_classification is not None: ## This KB line contains a preprint classification for the current institute try: current_institute_preprint_classifications.append((m_preprint_classification.group(1), \ m_preprint_classification.group(2))) except (AttributeError, NameError): ## didn't match this line correctly - skip it pass ## move on to the next line continue m_numeration_pattern = sre_numeration_pattern.search(rawline) if m_numeration_pattern is not None: ## This KB line contains a preprint item numeration pattern for the current institute try: current_institute_numerations.append(m_numeration_pattern.group(1)) except (AttributeError, NameError): ## didn't match the numeration pattern correctly - skip it pass continue _add_institute_preprint_patterns(current_institute_preprint_classifications,\ current_institute_numerations,\ preprint_reference_search_regexp_patterns, \ standardised_preprint_reference_categories, kb_line_num) except IOError: ## problem opening KB for reading, or problem while reading from it: emsg = """Error: Could not build knowledge base containing institute preprint referencing"""\ """ patterns - failed to read from KB %(kb)s.\n""" \ % { 'kb' : fpath } sys.stderr.write(emsg) sys.stderr.flush() sys.exit(0) ## return the preprint reference patterns and the replacement strings for non-standard categ-strings: return (preprint_reference_search_regexp_patterns, standardised_preprint_reference_categories) def build_titles_knowledge_base(fpath): """Given the path to a knowledge base file, read in the contents of that file into a dictionary of search->replace word phrases. The search phrases are compiled into a regex pattern object. The knowledge base file should consist only of lines that take the following format: seek-term --- replace-term (i.e. a seek phrase on the left hand side, a replace phrase on the right hand side, with the two phrases being separated by 3 hyphens.) E.g.: ASTRONOMY AND ASTROPHYSICS ---Astron. Astrophys. The left-hand side term is a non-standard version of the title, whereas the right-hand side term is the standard version. If the KB file cannot be read from, or an unexpected line is encountered in the KB, an error message is output to standard error and execution is halted with an error-code 0. @param fpath: (string) the path to the knowledge base file. @return: (tuple) containing a list and a dictionary. The list contains compiled regex patterns used as search terms and will be used to force searching order to match that of the knowledge base. 
The dictionary contains the search->replace terms. The keys of the dictionary are the compiled regex word phrases used for searching in the reference lines; The values in the dictionary are the replace terms for matches. """ ## Initialise vars: ## dictionary of search and replace phrases from KB: kb = {} standardised_titles = {} seek_phrases = [] ## Pattern to recognise a correct knowledge base line: p_kb_line = sre.compile('^\s*(?P<seek>\w.*?)\s*---\s*(?P<repl>\w.*?)\s*$', sre.UNICODE) try: fh = open(fpath, "r") count = 0 for rawline in fh: count += 1 ## Test line to ensure that it is a correctly formatted knowledge base line: try: m_kb_line = p_kb_line.search(rawline.decode("utf-8").rstrip("\n")) except UnicodeError: sys.stderr.write("*** Unicode problems in %s for line %s\n" \ % (fpath, str(count))) if m_kb_line is not None: ## good KB line seek_phrase = m_kb_line.group('seek') if len(seek_phrase) > 1: ## add the phrase from the KB if the 'seek' phrase is longer than 1 character: ## compile the seek phrase into a pattern: seek_ptn = sre.compile(r'(?<!\/)\b(' + sre.escape(seek_phrase) + r')[^A-Z0-9]', sre.UNICODE) if not kb.has_key(seek_phrase): kb[seek_phrase] = seek_ptn standardised_titles[seek_phrase] = m_kb_line.group('repl') seek_phrases.append(seek_phrase) else: ## KB line was not correctly formatted - die with error emsg = """Error: Could not build list of journal titles - KB %(kb)s has errors.\n""" \ % { 'kb' : fpath } sys.stderr.write(emsg) sys.exit(0) fh.close() except IOError: ## problem opening KB for reading, or problem while reading from it: emsg = """Error: Could not build list of journal titles - failed to read from KB %(kb)s.\n""" \ % { 'kb' : fpath } sys.stderr.write(emsg) sys.stderr.flush() sys.exit(0) ## return the raw knowledge base: return (kb, standardised_titles, seek_phrases) -## NICK - 2007/01/11 def standardize_and_markup_numeration_of_citations_in_line(line): """Given a reference line, attepmt to locate instances of citation 'numeration' in the line. Upon finding some numeration, re-arrange it into a standard order, and mark it up with tags. Will process numeration in the following order: Delete the colon and expressions such as Serie, vol, V. inside the pattern <serie : volume> E.g.: Replace the string 'Series A, Vol 4' with 'A 4' Then, the 4 main numeration patterns: Pattern 0 (was pattern 3): <x, vol, page, year> <v, [FS]?, p, y> <[FS]?, v, p, y> Pattern 1: <x, vol, year, page> <v, [FS]?, y, p> <[FS]?, v, y, p> Pattern 2: <vol, serie, year, page> <v, s, [FS]?, y, p> <v, [FS]?, s, y, p Pattern 4: <vol, serie, page, year> <v, s, [FS]?, p, y> <v, [FS]?, s, p, y> @param line: (string) the reference line. @return: (string) the reference line after numeration has been checked and possibly recognized/marked-up. 
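       Illustrative doctest (editor's sketch of the behaviour described
       above; not part of the original module):
           >>> standardize_and_markup_numeration_of_citations_in_line(u'Phys. Lett. B 466 (1999) 415')
           u'Phys. Lett. B : <cds.VOL>466</cds.VOL> <cds.YR>(1999)</cds.YR> <cds.PG>415</cds.PG> '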
""" line = sre_strip_series_and_volume_labels[0].sub(sre_strip_series_and_volume_labels[1], line) line = sre_numeration_vol_nucphys_page_yr[0].sub(sre_numeration_vol_nucphys_page_yr[1], line) line = sre_numeration_nucphys_vol_page_yr[0].sub(sre_numeration_nucphys_vol_page_yr[1], line) line = sre_numeration_vol_nucphys_yr_page[0].sub(sre_numeration_vol_nucphys_yr_page[1], line) line = sre_numeration_nucphys_vol_yr_page[0].sub(sre_numeration_nucphys_vol_yr_page[1], line) line = sre_numeration_vol_series_nucphys_yr_page[0].sub(sre_numeration_vol_series_nucphys_yr_page[1], line) line = sre_numeration_vol_nucphys_series_yr_page[0].sub(sre_numeration_vol_nucphys_series_yr_page[1], line) line = sre_numeration_vol_series_nucphys_page_yr[0].sub(sre_numeration_vol_series_nucphys_page_yr[1], line) line = sre_numeration_vol_nucphys_series_page_yr[0].sub(sre_numeration_vol_nucphys_series_page_yr[1], line) return line def identify_preprint_report_numbers(line, preprint_repnum_search_kb, preprint_repnum_standardised_categs): """Attempt to identify all preprint report numbers in a reference line. Report numbers will be identified, their information (location in line, length in line, and standardised replacement version) will be record, and they will be replaced in the working- line by underscores. @param line: (string) - the working reference line. @param preprint_repnum_search_kb: (dictionary) - contains the regexp patterns used to identify preprint report numbers. @param preprint_repnum_standardised_categs: (dictionary) - contains the standardised 'category' of a given preprint report number. @return: (tuple) - 3 elements: * a dictionary containing the lengths in the line of the matched preprint report numbers, keyed by the index at which each match was found in the line. * a dictionary containing the replacement strings (standardised versions) of preprint report numbers that were matched in the line. * a string, that is the new version of the working reference line, in which any matched preprint report numbers have been replaced by underscores. Returned tuple is therefore in the following order: (matched-reportnum-lengths, matched-reportnum-replacements, working-line) """ def _by_len(a, b): """Comparison function used to sort a list by the length of the strings in each element of the list. 
""" if len(a[1]) < len(b[1]): return 1 elif len(a[1]) == len(b[1]): return 0 else: return -1 repnum_matches_matchlen = {} ## info about lengths of report numbers matched at given locations in line repnum_matches_repl_str = {} ## standardised report numbers matched at given locations in line preprint_repnum_categs = preprint_repnum_standardised_categs.keys() preprint_repnum_categs.sort(_by_len) ## try to match preprint report numbers in the line: for categ in preprint_repnum_categs: ## search for all instances of the current report numbering style in the line: repnum_matches_iter = preprint_repnum_search_kb[categ].finditer(line) ## for each matched report number of this style: for repnum_match in repnum_matches_iter: ## Get the matched text for the numeration part of the preprint report number: numeration_match = repnum_match.group('numn') ## clean/standardise this numeration text: numeration_match = numeration_match.replace(" ", "-") numeration_match = sre_multiple_hyphens.sub("-", numeration_match) numeration_match = numeration_match.replace("/-", "/") numeration_match = numeration_match.replace("-/", "/") numeration_match = numeration_match.replace("-/-", "/") ## replace the found preprint report number in the string with underscores: line = line[0:repnum_match.start()] + "_"*len(repnum_match.group(0)) + line[repnum_match.end():] ## record the information about the matched preprint report number: ## total length in the line of the matched preprint report number: repnum_matches_matchlen[repnum_match.start()] = len(repnum_match.group(0)) ## standardised replacement for the matched preprint report number: repnum_matches_repl_str[repnum_match.start()] = preprint_repnum_standardised_categs[categ] + numeration_match ## return recorded information about matched report numbers, along with the newly changed working line: return (repnum_matches_matchlen, repnum_matches_repl_str, line) def identify_and_tag_URLs(line): """Given a reference line, identify URLs in the line and tag them between <cds.URL> tags. URLs are identified in 2 forms: + Raw: http //cdsware.cern.ch/ + HTML marked-up: <a href="http //cdsware.cern.ch/">CERN Document Server Software Consortium</a> These URLs are considered to have 2 components: The URL itself (url string); and the URL description. The description is effectively the text used for the created Hyperlink when the URL is marked-up in HTML. When an HTML marked-up URL has been recognised, the text between the anchor tags is therefore taken as the URL description. In the case of a raw URL recognition, however, the URL itself will also be used as the URL description. For example, in the following reference line: [1] See <a href="http //cdsware.cern.ch/">CERN Document Server Software Consortium</a>. ...the URL string will be "http //cdsware.cern.ch/" and the URL description will be "CERN Document Server Software Consortium". The line returned will therefore be: [1] See <cds.URL description="http //cdsware.cern.ch/">CERN Document Server Software Consortium</cds.URL>. In the following line, however: [1] See http //cdsware.cern.ch/ for more details. ...the URL string will be "http //cdsware.cern.ch/" and the URL description will also be "http //cdsware.cern.ch/". The line returned will therefore be: [1] See <cds.URL description="http //cdsware.cern.ch/">http //cdsware.cern.ch/</cds.URL> for more details. Note that URLs recognised may not have the colon separator in the protocol. 
This is because in the step prior to the calling of this function, colons will have been removed from the line so that numeration (as found in journal article citations) could be identified and tagged. @param line: (string) the reference line in which to search for URLs. @return: (string) the reference line in which any recognised URLs have been tagged. """ ## Dictionaries to record details of matched URLs: found_url_full_matchlen = {} found_url_urlstring = {} found_url_urldescr = {} ## Attempt to identify and tag all HTML-MARKED-UP URLs in the line: m_tagged_url_iter = sre_html_tagged_url.finditer(line) for m_tagged_url in m_tagged_url_iter: startposn = m_tagged_url.start() ## start position of matched URL endposn = m_tagged_url.end() ## end position of matched URL matchlen = len(m_tagged_url.group(0)) ## total length of URL match found_url_full_matchlen[startposn] = matchlen found_url_urlstring[startposn] = m_tagged_url.group(3) found_url_urldescr[startposn] = m_tagged_url.group(12) ## temporarily replace the URL match with underscores so that it won't be re-found line = line[0:startposn] + u"_"*matchlen + line[endposn:] ## Attempt to identify and tag all RAW (i.e. not HTML-marked-up) URLs in the line: m_raw_url_iter = sre_raw_url.finditer(line) for m_raw_url in m_raw_url_iter: startposn = m_raw_url.start() ## start position of matched URL endposn = m_raw_url.end() ## end position of matched URL matchlen = len(m_raw_url.group(0)) ## total length of URL match matched_url = m_raw_url.group(1) if len(matched_url) > 0 and matched_url[-1] in (".", ","): ## Strip the full-stop or comma from the end of the url: matched_url = matched_url[:-1] found_url_full_matchlen[startposn] = matchlen found_url_urlstring[startposn] = matched_url found_url_urldescr[startposn] = matched_url ## temporarily replace the URL match with underscores so that it won't be re-found line = line[0:startposn] + u"_"*matchlen + line[endposn:] ## Now that all URLs have been identified, insert them back into the line, tagged: found_url_positions = found_url_urlstring.keys() found_url_positions.sort() extras_from_previous_url = 0 for url_position in found_url_positions: line = line[0:url_position + extras_from_previous_url] \ + """<cds.URL description="%(url-description)s">%(url)s</cds.URL>""" \ % { 'url-description' : found_url_urldescr[url_position], 'url' : found_url_urlstring[url_position], } \ + line[url_position+found_url_full_matchlen[url_position]+extras_from_previous_url:] extras_from_previous_url += len("""<cds.URL description=""></cds.URL>""") \ + len(found_url_urldescr[url_position]) ## return the line containing the tagged URLs: return line def identify_periodical_titles(line, periodical_title_search_kb, periodical_title_search_keys): """Attempt to identify all periodical titles in a reference line. Titles will be identified, their information (location in line, length in line, and non- standardised version) will be record, and they will be replaced in the working line by underscores. @param line: (string) - the working reference line. @param periodical_title_search_kb: (dictionary) - contains the regexp patterns used to search for a non-standard TITLE in the working reference line. Keyed by the TITLE string itself. @param periodical_title_search_keys: (list) - contains the non-standard periodical TITLEs to be searched for in the line. This list of titles has already been ordered and is used to force the order of searching. 
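       For example (editor's illustration with hypothetical content): in a
       working-line segment such as
           [1] PHYSICAL REVIEW <CDS VOL>...</CDS PG>
       a match on the non-standard title 'PHYSICAL REVIEW' at index 4 would
       be recorded in both returned dictionaries under key 4, and the
       matched title text would be overwritten with underscores in the
       returned working line.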
@return: (tuple) containing 3 elements: + (dictionary) - the lengths of all titles matched at each given index within the line. + (dictionary) - the text actually matched for each title at each given index within the line. + (string) - the working line, with the titles removed from it and replaced by underscores. """ title_matches_matchlen = {} ## info about lengths of periodical titles matched at given locations in the line title_matches_matchtext = {} ## the text matched at the given line location (i.e. the title itself) ## Split the line into segments based on "</CDS PG>" ocurrences. Since the name of a periodical should ## come before the numeration, "</CDS PG>" should mark the end of the recognised numeration and should be the ## splitting point. ## By splitting the line into segments that each contain only one instance of numeration, it can be said that at most, ## there can only be one "meaningful" (one that can be linked to the numeration information) periodical in that segment. ## This means that after identifying a title that is next to the numeration in the line, there should be no others in ## the line and it should be possible to stop searching in the line for other titles. line_segments = map(lambda x: ((x.find("<CDS PG>") != -1) and (x + "</CDS PG>") or (x)), line.split("</CDS PG>")) if line_segments[len(line_segments) - 1] == "": ## if the last element in the list of line segments is empty, drop it: line_segments = line_segments[0:len(line_segments) - 1] num_segments = len(line_segments) len_previous_segments = 0 ## the combined length of previous line segments. Used to determine correct position ## in the line of a matched title, when dealing with line segments. ## Begin searching: ## for each line segment: for i in xrange(0, num_segments): if line_segments[i].find("<CDS ") == -1: ## no recognised numeration in this line - don't bother to search for titles as they will be useless: continue segment_match = 0 ## reset the segment-match flag as we start to check for titles in a new segment for title in periodical_title_search_keys: if segment_match != 0: ## a usable title match has been found in the current line-segment - discontinue testing for ## titles in this segment: break ## search for all instances of the current periodical title in the current line-segment: title_matches_iter = periodical_title_search_kb[title].finditer(line_segments[i]) ## for each matched periodical title: for title_match in title_matches_iter: ## record the details of this title match: ## record the match length: title_matches_matchlen[len_previous_segments + title_match.start()] = len(title_match.group(0)) - 1 ## record the matched non-standard version of the title: title_matches_matchtext[len_previous_segments + title_match.start()] = title ## replace the matched title text in the line it n * '-', where n is the length of the matched title: line_segments[i] = line_segments[i][0:title_match.start(1)] + "_"*len(title_match.group(1)) \ + line_segments[i][title_match.end(1):] ## is this match next to the numeration tags? 
If yes, drop out of loop: if sre_tagged_numeration_near_line_start.match(line_segments[i][title_match.end():]) is not None: ## Found a good match - drop out of this loop: segment_match = 1 break ## add the length of this segment to the combined length of previous segments: len_previous_segments += len(line_segments[i]) ## rebuild a complete line from the segments: processed_line = "".join(line_segments) ## return recorded information about matched periodical titles, ## along with the newly changed working line: return (title_matches_matchlen, title_matches_matchtext, processed_line) def identify_ibids(line): """Find IBIDs within the line, record their position and length, and replace them with underscores. @param line: (string) the working reference line @return: (tuple) containing 2 dictionaries and a string: Dictionary 1: matched IBID lengths (Key: position of IBID in line; Value: length of matched IBID) Dictionary 2: matched IBID text: (Key: position of IBID in line; Value: matched IBID text) String: working line with matched IBIDs removed """ ibid_match_len = {} ibid_match_txt = {} ibid_matches_iter = sre_ibid.finditer(line) ## Record details of each matched ibid: for m_ibid in ibid_matches_iter: ibid_match_len[m_ibid.start()] = len(m_ibid.group(2)) ibid_match_txt[m_ibid.start()] = m_ibid.group(2) ## Replace matched text in line with underscores: line = line[0:m_ibid.start(2)] + "_"*len(m_ibid.group(2)) + line[m_ibid.end(2):] return (ibid_match_len, ibid_match_txt, line) def get_replacement_types(titles, reportnumbers): """Given the indices of the titles and reportnumbers that have been recognised within a reference line, create a dictionary keyed by the replacement position in the line, where the value for each key is a string describing the type of item replaced at that position in the line. The description strings are: 'title' - indicating that the replacement is a periodical title 'reportnumber' - indicating that the replacement is a preprint report number. @param titles: (list) of locations in the string at which periodical titles were found. @param reportnumbers: (list) of locations in the string at which reportnumbers were found. @return: (dictionary) of replacement types at various locations within the string. """ rep_types = {} for item_idx in titles: rep_types[item_idx] = "title" for item_idx in reportnumbers: rep_types[item_idx] = "reportnumber" return rep_types def account_for_stripped_whitespace(spaces_keys, removed_spaces, replacement_types, len_reportnums, len_titles, replacement_index): """To build a processed (MARC XML) reference line in which the recognised citations such as standardised periodical TITLEs and REPORT-NUMBERs have been marked up, it is necessary to read from the reference line BEFORE all punctuation was stripped and it was made into upper-case. The indices of the cited items in this 'original line', however, will be different to those in the 'working-line', in which punctuation and multiple-spaces were stripped out. For example, the following reading-line: [26] E. Witten and S.-T. Yau, hep-th/9910245. ...becomes (after punctuation and multiple white-space stripping): [26] E WITTEN AND S T YAU HEP TH/9910245 It can be seen that the report-number citation (hep-th/9910245) is at a different index in the two strings. When refextract searches for this citation, it uses the 2nd string (i.e. that which is capitalised and has no punctuation). When it builds the MARC XML representation of the reference line, however, it needs to read from the first string. 
It must therefore consider the whitespace, punctuation, etc. that have been removed, in order to get the correct index for the cited item. This function accounts for the stripped characters before a given TITLE or REPORT-NUMBER index. @param spaces_keys: (list) - the indices at which spaces were removed from the reference line. @param removed_spaces: (dictionary) - keyed by the indices at which spaces were removed from the line, the values are the number of spaces actually removed from that position. So, for example, "3 spaces were removed from position 25 in the line." @param replacement_types: (dictionary) - at each 'replacement_index' in the line, the type of replacement to make (title or reportnumber). @param len_reportnums: (dictionary) - the lengths of the REPORT-NUMBERs matched at the various indices in the line. @param len_titles: (dictionary) - the lengths of the various TITLEs matched at the various indices in the line. @param replacement_index: (integer) - the index in the working line of the identified TITLE or REPORT-NUMBER citation. @return: (tuple) containing 2 elements: + the true replacement index of a replacement in the reading line; + any extras to add into the replacement index; """ extras = 0 true_replacement_index = replacement_index spare_replacement_index = replacement_index for space in spaces_keys: if space < true_replacement_index: ## There were spaces stripped before the current replacement - add the number of spaces removed from ## this location to the current replacement index: true_replacement_index += removed_spaces[space] spare_replacement_index += removed_spaces[space] elif (space >= spare_replacement_index) and (replacement_types[replacement_index] == u"title") \ and (space < (spare_replacement_index + len_titles[replacement_index])): ## A periodical title is being replaced. Account for multi-spaces that may have been stripped ## from the title before its recognition: spare_replacement_index += removed_spaces[space] extras += removed_spaces[space] elif (space >= spare_replacement_index) and (replacement_types[replacement_index] == u"reportnumber") \ and (space < (spare_replacement_index + len_reportnums[replacement_index])): ## An institutional preprint report-number is being replaced. Account for multi-spaces that may ## have been stripped from it before its recognition: spare_replacement_index += removed_spaces[space] extras += removed_spaces[space] ## return the new values for replacement indices with stripped whitespace accounted for: return (true_replacement_index, extras) def create_marc_xml_reference_line(working_line, found_title_len, found_title_matchtext, pprint_repnum_len, pprint_repnum_matchtext, removed_spaces, standardised_titles): """After the phase of identifying and tagging citation instances in a reference line, this function is called to go through the line and the collected information about the recognised citations, and to transform the line into a string of MARC XML in which the recognised citations are grouped under various datafields and subfields, depending upon their type. @param working_line: (string) - this is the line before the punctuation was stripped. At this stage, it has not been capitalised, and neither TITLES nor REPORT NUMBERS have been stripped from it. However, any recognised numeration and/or URLs have been tagged with <cds.YYYY> tags. The working_line could, for example, look something like this: [1] CDS <cds.URL description="http //cdsware.cern.ch/">http //cdsware.cern.ch/</cds.URL>.
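A line in which numeration has also been recognised might, for example, look something like this (an illustrative sketch only - the exact layout of the numeration tags is decided by the earlier numeration-standardisation stage): [2] A. N. Example, Nucl. Phys. B <cds.VOL>572</cds.VOL> <cds.YR>(2000)</cds.YR> <cds.PG>211</cds.PG>.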
@param found_title_len: (dictionary) - the lengths of the title citations that have been recognised in the line. Keyed by the index within the line of each match. @param found_title_matchtext: (dictionary) - The text that was found for each matched title citation in the line. Keyed by the index within the line of each match. @param pprint_repnum_len: (dictionary) - the lengths of the matched institutional preprint report number citations found within the line. Keyed by the index within the line of each match. @param pprint_repnum_matchtext: (dictionary) - The matched text for each matched institutional report number. Keyed by the index within the line of each match. @param removed_spaces: (dictionary) - The number of spaces removed from the various positions in the line. Keyed by the index of the position within the line at which the spaces were removed. @param standardised_titles: (dictionary) - The standardised journal titles, keyed by the non-standard version of those titles. @return: (tuple) of 5 components: ( string -> a MARC XML-ized reference line. integer -> number of fields of miscellaneous text marked-up for the line. integer -> number of title citations marked-up for the line. integer -> number of institutional report-number citations marked-up for the line. integer -> number of URL citations marked-up for the line. ) """ if len(found_title_len) + len(pprint_repnum_len) == 0: ## no TITLE or REPORT-NUMBER citations were found within this line, use the raw line: ## (This 'raw' line could still be tagged with recognised URLs or numeration.) tagged_line = working_line else: ## TITLE and/or REPORT-NUMBER citations were found in this line, build a new ## version of the working-line in which the standard versions of the REPORT-NUMBERs ## and TITLEs are tagged: startpos = 0 ## First cell of the reference line... previous_match = u"" ## previously matched TITLE within line (used for replacement ## of IBIDs). replacement_types = {} title_keys = found_title_matchtext.keys() title_keys.sort() pprint_keys = pprint_repnum_matchtext.keys() pprint_keys.sort() spaces_keys = removed_spaces.keys() spaces_keys.sort() replacement_types = get_replacement_types(title_keys, pprint_keys) replacement_locations = replacement_types.keys() replacement_locations.sort() tagged_line = u"" ## This is to be the new 'working-line'. It will contain the ## tagged TITLEs and REPORT-NUMBERs, as well as any previously ## tagged URLs and numeration components.
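## (An illustrative walk-through of the rebuild performed below, using assumed example data: if replacement_locations
## were [5, 40], with index 5 recorded as a 'title' and index 40 as a 'reportnumber', the loop would first copy
## working_line up to index 5 and append the tagged standardised TITLE, then copy the text between the two citations
## and append the tagged REPORT-NUMBER; after the loop, the remainder of working_line is appended.)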
## begin: for replacement_index in replacement_locations: ## first, factor in any stripped spaces before this 'replacement' (true_replacement_index, extras) = \ account_for_stripped_whitespace(spaces_keys, removed_spaces, replacement_types, pprint_repnum_len, found_title_len, replacement_index) if replacement_types[replacement_index] == u"title": ## Add a tagged periodical TITLE into the line: (rebuilt_chunk, startpos, previous_match) = \ add_tagged_title(reading_line=working_line, len_title=found_title_len[replacement_index], matched_title=found_title_matchtext[replacement_index], previous_match=previous_match, startpos=startpos, true_replacement_index=true_replacement_index, extras=extras, standardised_titles=standardised_titles) tagged_line += rebuilt_chunk elif replacement_types[replacement_index] == u"reportnumber": ## Add a tagged institutional preprint REPORT-NUMBER into the line: (rebuilt_chunk, startpos) = \ add_tagged_report_number(reading_line=working_line, len_reportnum=pprint_repnum_len[replacement_index], reportnum=pprint_repnum_matchtext[replacement_index], startpos=startpos, true_replacement_index=true_replacement_index, extras=extras) tagged_line += rebuilt_chunk ## add the remainder of the original working-line into the rebuilt line: tagged_line += working_line[startpos:] ## use the recently marked-up title information to identify any numeration that escaped the last pass: tagged_line = _re_identify_numeration(tagged_line) ## remove any series tags that are next to title tags, putting series information into the title tags: tagged_line = move_tagged_series_into_tagged_title(tagged_line) tagged_line = wash_line(tagged_line) ## Now, from the tagged line, create a MARC XML string, marking up any recognised citations: (xml_line, count_misc, count_title, count_reportnum, count_url) = \ convert_processed_reference_line_to_marc_xml(tagged_line) return (xml_line, count_misc, count_title, count_reportnum, count_url) def _refextract_markup_title_as_marcxml(title, volume, year, page, misc_text=""): """Given a title, its numeration and some optional miscellaneous text, return a string containing the MARC XML version of this information. E.g. for the miscellaneous text "S. D. Hsu and M. Schwetz ", the title "Nucl. Phys., B", the volume "572", the year "2000" and the page number "211" return the following MARC XML string: <datafield tag="999" ind1="C" ind2="5"> <subfield code="m">S. D. Hsu and M. Schwetz </subfield> <subfield code="s">Nucl. Phys., B 572 (2000) 211</subfield> </datafield> In the event that the miscellaneous text string is zero-length, there will be no $m subfield present in the returned XML. @param title: (string) - the cited title. @param volume: (string) - the volume of the cited title. @param year: (string) - the year of the cited title. @param page: (string) - the page of the cited title. @param misc_text: (string) - the miscellaneous text to be marked up. @return: (string) MARC XML representation of the cited title and its miscellaneous text. 
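For instance, the example output above would be produced by a call of the form (illustrative only): _refextract_markup_title_as_marcxml("Nucl. Phys., B", "572", "2000", "211", "S. D. Hsu and M. Schwetz ")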
""" ## First, determine whether there is need of a misc subfield: if len(misc_text) > 0: ## create a misc subfield to be included in the MARC XML: xml_misc_subfield = """ <subfield code="%(sf-code-ref-misc)s">%(misc-val)s</subfield>""" \ % { 'sf-code-ref-misc' : CFG_REFEXTRACT_SUBFIELD_MISC, 'misc-val' : encode_for_xml(misc_text), } else: ## the misc subfield is not needed xml_misc_subfield = "" ## Build the datafield for the report number segment of the reference line: xml_line = \ """ <datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s">%(misc-subfield)s <subfield code="%(sf-code-ref-title)s">%(title)s %(volume)s (%(year)s) %(page)s</subfield> </datafield> """ % { 'df-tag-ref' : CFG_REFEXTRACT_TAG_ID_REFERENCE, 'df-ind1-ref' : CFG_REFEXTRACT_IND1_REFERENCE, 'df-ind2-ref' : CFG_REFEXTRACT_IND2_REFERENCE, 'sf-code-ref-title' : CFG_REFEXTRACT_SUBFIELD_TITLE, 'misc-subfield' : xml_misc_subfield, 'title' : encode_for_xml(title), 'volume' : encode_for_xml(volume), 'year' : encode_for_xml(year), 'page' : encode_for_xml(page), } return xml_line def _refextract_markup_title_followed_by_report_number_as_marcxml(title, volume, year, page, report_number, misc_text=""): """Given a title (and its numeration), a report number, and some optional miscellaneous text, return a string containing the MARC XML version of this information. E.g. for the miscellaneous text "S. D. Hsu and M. Schwetz ", the report number "hep-th/1111111", the title "Nucl. Phys., B", the volume "572", the year "2000", and the page number "211", return the following MARC XML string: <datafield tag="999" ind1="C" ind2="5"> <subfield code="m">S. D. Hsu and M. Schwetz </subfield> <subfield code="r">hep-th/1111111</subfield> <subfield code="s">Nucl. Phys., B 572 (2000) 211</subfield> </datafield> In the event that the miscellaneous text string is zero-length, there will be no $m subfield present in the returned XML. @param title: (string) - the cited title. @param volume: (string) - the volume of the cited title. @param year: (string) - the year of the cited title. @param page: (string) - the page of the cited title. @param report_number: (string) - the institutional report number to be marked up. @param misc_text: (string) - the miscellaneous text to be marked up. @return: (string) MARC XML representation of the cited title and its miscellaneous text. 
""" ## First, determine whether there is need of a misc subfield: if len(misc_text) > 0: ## create a misc subfield to be included in the MARC XML: xml_misc_subfield = """ <subfield code="%(sf-code-ref-misc)s">%(misc-val)s</subfield>""" \ % { 'sf-code-ref-misc' : CFG_REFEXTRACT_SUBFIELD_MISC, 'misc-val' : encode_for_xml(misc_text), } else: ## the misc subfield is not needed xml_misc_subfield = "" ## Build the datafield for the report number segment of the reference line: xml_line = \ """ <datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s">%(misc-subfield)s <subfield code="%(sf-code-ref-title)s">%(title)s %(volume)s (%(year)s) %(page)s</subfield> <subfield code="%(sf-code-ref-report-num)s">%(report-number)s</subfield> </datafield> """ % { 'df-tag-ref' : CFG_REFEXTRACT_TAG_ID_REFERENCE, 'df-ind1-ref' : CFG_REFEXTRACT_IND1_REFERENCE, 'df-ind2-ref' : CFG_REFEXTRACT_IND2_REFERENCE, 'sf-code-ref-title' : CFG_REFEXTRACT_SUBFIELD_TITLE, 'sf-code-ref-report-num' : CFG_REFEXTRACT_SUBFIELD_REPORT_NUM, 'misc-subfield' : xml_misc_subfield, 'title' : encode_for_xml(title), 'volume' : encode_for_xml(volume), 'year' : encode_for_xml(year), 'page' : encode_for_xml(page), 'report-number' : encode_for_xml(report_number), } return xml_line def _refextract_markup_report_number_followed_by_title_as_marcxml(title, volume, year, page, report_number, misc_text=""): """Given a title (and its numeration), a report number, and some optional miscellaneous text, return a string containing the MARC XML version of this information. E.g. for the miscellaneous text "S. D. Hsu and M. Schwetz ", the title "Nucl. Phys., B", the volume "572", the year "2000", the page number "211", and the report number "hep-th/1111111", return the following MARC XML string: <datafield tag="999" ind1="C" ind2="5"> <subfield code="m">S. D. Hsu and M. Schwetz </subfield> <subfield code="s">Nucl. Phys., B 572 (2000) 211</subfield> <subfield code="r">hep-th/1111111</subfield> </datafield> In the event that the miscellaneous text string is zero-length, there will be no $m subfield present in the returned XML. @param title: (string) - the cited title. @param volume: (string) - the volume of the cited title. @param year: (string) - the year of the cited title. @param page: (string) - the page of the cited title. @param report_number: (string) - the institutional report number to be marked up. @param misc_text: (string) - the miscellaneous text to be marked up. @return: (string) MARC XML representation of the cited title and its miscellaneous text. 
""" ## First, determine whether there is need of a misc subfield: if len(misc_text) > 0: ## create a misc subfield to be included in the MARC XML: xml_misc_subfield = """ <subfield code="%(sf-code-ref-misc)s">%(misc-val)s</subfield>""" \ % { 'sf-code-ref-misc' : CFG_REFEXTRACT_SUBFIELD_MISC, 'misc-val' : encode_for_xml(misc_text), } else: ## the misc subfield is not needed xml_misc_subfield = "" ## Build the datafield for the report number segment of the reference line: xml_line = \ """ <datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s">%(misc-subfield)s <subfield code="%(sf-code-ref-report-num)s">%(report-number)s</subfield> <subfield code="%(sf-code-ref-title)s">%(title)s %(volume)s (%(year)s) %(page)s</subfield> </datafield> """ % { 'df-tag-ref' : CFG_REFEXTRACT_TAG_ID_REFERENCE, 'df-ind1-ref' : CFG_REFEXTRACT_IND1_REFERENCE, 'df-ind2-ref' : CFG_REFEXTRACT_IND2_REFERENCE, 'sf-code-ref-title' : CFG_REFEXTRACT_SUBFIELD_TITLE, 'sf-code-ref-report-num' : CFG_REFEXTRACT_SUBFIELD_REPORT_NUM, 'misc-subfield' : xml_misc_subfield, 'title' : encode_for_xml(title), 'volume' : encode_for_xml(volume), 'year' : encode_for_xml(year), 'page' : encode_for_xml(page), 'report-number' : encode_for_xml(report_number), } return xml_line def _refextract_markup_reportnumber_as_marcxml(report_number, misc_text=""): """Given a report number and some optional miscellaneous text, return a string containing the MARC XML version of this information. E.g. for the miscellaneous text "Example, AN " and the institutional report number "hep-th/1111111", return the following MARC XML string: <datafield tag="999" ind1="C" ind2="5"> <subfield code="m">Example, AN </subfield> <subfield code="r">hep-th/1111111</subfield> </datafield> In the event that the miscellaneous text string is zero-length, there will be no $m subfield present in the returned XML. @param report_number: (string) - the institutional report number to be marked up. @param misc_text: (string) - the miscellaneous text to be marked up. @return: (string) MARC XML representation of the report number and its miscellaneous text. """ ## First, determine whether there is need of a misc subfield: if len(misc_text) > 0: ## create a misc subfield to be included in the MARC XML: xml_misc_subfield = """ <subfield code="%(sf-code-ref-misc)s">%(misc-val)s</subfield>""" \ % { 'sf-code-ref-misc' : CFG_REFEXTRACT_SUBFIELD_MISC, 'misc-val' : encode_for_xml(misc_text), } else: ## the misc subfield is not needed xml_misc_subfield = "" ## Build the datafield for the report number segment of the reference line: xml_line = \ """ <datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s">%(misc-subfield)s <subfield code="%(sf-code-ref-report-num)s">%(report-number)s</subfield> </datafield> """ % { 'df-tag-ref' : CFG_REFEXTRACT_TAG_ID_REFERENCE, 'df-ind1-ref' : CFG_REFEXTRACT_IND1_REFERENCE, 'df-ind2-ref' : CFG_REFEXTRACT_IND2_REFERENCE, 'sf-code-ref-report-num' : CFG_REFEXTRACT_SUBFIELD_REPORT_NUM, 'misc-subfield' : xml_misc_subfield, 'report-number' : encode_for_xml(report_number), } return xml_line def _refextract_markup_url_as_marcxml(url_string, url_description, misc_text=""): """Given a URL, a URL description, and some optional miscellaneous text, return a string containing the MARC XML version of this information. E.g. 
for the miscellaneous text "Example, AN ", the URL "http://cdsweb.cern.ch/", and the URL description "CERN Document Server", return the following MARC XML string: <datafield tag="999" ind1="C" ind2="5"> <subfield code="m">Example, AN </subfield> <subfield code="u">http://cdsweb.cern.ch/</subfield> <subfield code="z">CERN Document Server</subfield> </datafield> In the event that the miscellaneous text string is zero-length, there will be no $m subfield present in the returned XML. @param url_string: (string) - the URL to be marked up. @param url_description: (string) - the description of the URL to be marked up. @param misc_text: (string) - the miscellaneous text to be marked up. @return: (string) MARC XML representation of the URL, its description, and its miscellaneous text. """ ## First, determine whether there is need of a misc subfield: if len(misc_text) > 0: ## create a misc subfield to be included in the MARC XML: xml_misc_subfield = """ <subfield code="%(sf-code-ref-misc)s">%(misc-val)s</subfield>""" \ % { 'sf-code-ref-misc' : CFG_REFEXTRACT_SUBFIELD_MISC, 'misc-val' : encode_for_xml(misc_text), } else: ## the misc subfield is not needed xml_misc_subfield = "" ## Build the datafield for the URL segment of the reference line: xml_line = \ """ <datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s">%(misc-subfield)s <subfield code="%(sf-code-ref-url)s">%(url)s</subfield> <subfield code="%(sf-code-ref-url-descr)s">%(url-descr)s</subfield> </datafield> """ % { 'df-tag-ref' : CFG_REFEXTRACT_TAG_ID_REFERENCE, 'df-ind1-ref' : CFG_REFEXTRACT_IND1_REFERENCE, 'df-ind2-ref' : CFG_REFEXTRACT_IND2_REFERENCE, 'sf-code-ref-url' : CFG_REFEXTRACT_SUBFIELD_URL, 'sf-code-ref-url-descr' : CFG_REFEXTRACT_SUBFIELD_URL_DESCR, 'misc-subfield' : xml_misc_subfield, 'url' : encode_for_xml(url_string), 'url-descr' : encode_for_xml(url_description), } return xml_line def _refextract_markup_reference_line_marker_as_marcxml(marker_text): """Given a reference line marker, return a string containing the MARC XML version of the marker. E.g. for the line marker "[1]", return the following xml string: <datafield tag="999" ind1="C" ind2="5"> <subfield code="o">[1]</subfield> </datafield> @param marker_text: (string) the reference line marker to be marked up as MARC XML @return: (string) MARC XML representation of the marker line. """ xml_line = \ """ <datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s"> <subfield code="%(sf-code-ref-marker)s">%(marker-val)s</subfield> </datafield> """ % { 'df-tag-ref' : CFG_REFEXTRACT_TAG_ID_REFERENCE, 'df-ind1-ref' : CFG_REFEXTRACT_IND1_REFERENCE, 'df-ind2-ref' : CFG_REFEXTRACT_IND2_REFERENCE, 'sf-code-ref-marker' : CFG_REFEXTRACT_SUBFIELD_MARKER, 'marker-val' : encode_for_xml(marker_text), } return xml_line def _refextract_markup_miscellaneous_text_as_marcxml(misc_text): """Given some miscellaneous text, return a string containing the MARC XML version of the string. E.g. for the misc_text string "testing", return the following xml string: <datafield tag="999" ind1="C" ind2="5"> <subfield code="m">testing</subfield> </datafield> @param misc_text: (string) the miscellaneous text to be marked up as MARC XML @return: (string) MARC XML representation of the miscellaneous text. 
""" xml_line = \ """ <datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s"> <subfield code="%(sf-code-ref-misc)s">%(misc-val)s</subfield> </datafield> """ % { 'df-tag-ref' : CFG_REFEXTRACT_TAG_ID_REFERENCE, 'df-ind1-ref' : CFG_REFEXTRACT_IND1_REFERENCE, 'df-ind2-ref' : CFG_REFEXTRACT_IND2_REFERENCE, 'sf-code-ref-misc' : CFG_REFEXTRACT_SUBFIELD_MISC, 'misc-val' : encode_for_xml(misc_text), } return xml_line def _convert_unusable_tag_to_misc(line, misc_text, tag_match_start, tag_match_end, closing_tag): """Function to remove an unwanted, tagged, citation item from a reference line. Everything prior to the opening tag, as well as the tagged item itself, is put into the miscellaneous text variable; the data up to the closing tag is then trimmed from the beginning of the working line. For example, the following working line: Example, AN. Testing software; <cds.YR>(2001)</cds.YR>, CERN, Geneva. ...would be trimmed down to: , CERN, Geneva. ...And the Miscellaneous text taken from the start of the line would be: Example, AN. Testing software; (2001) ...(assuming that the details of <cds.YR> and </cds.YR> were passed to the function). @param line: (string) - the reference line. @param misc_text: (string) - the variable containing the miscellaneous text recorded so far. @param tag_match_start: (integer) - the index of the start of the opening tag in the line. @param tag_match_end: (integer) - the index of the end of the opening tag in the line. @param closing_tag: (string) - the closing tag to look for in the line (e.g. </cds.YR>). @return: (tuple) - containing misc_text (string) and line (string) """ misc_text += line[0:tag_match_start] ## extract the tagged information: idx_closing_tag = line.find(closing_tag, tag_match_end) ## Sanity check - did we find a closing tag? if idx_closing_tag == -1: ## no closing tag found - strip the opening tag and move past this ## recognised item as it is unusable: line = line[tag_match_end:] else: ## closing tag was found misc_text += line[tag_match_end:idx_closing_tag] ## now trim the matched item and its tags from the start of the line: line = line[idx_closing_tag+len(closing_tag):] return (misc_text, line) def convert_processed_reference_line_to_marc_xml(line): """Given a processed reference line, convert it to MARC XML. @param line: (string) - the processed reference line, in which the recognised citations have been tagged. @return: (tuple) - + xml_line (string) - the reference line with all of its identified citations marked up into the various subfields. + count_misc (integer) - number of sections of miscellaneous found in the line + count_title (integer) - number of title-citations found in the line + count_reportnum (integer) - number of report numbers found in the line + count_url (integer) - number of URLs found in the line """ count_misc = count_title = count_reportnum = count_url = 0 xml_line = "" previously_cited_item = None processed_line = line.lstrip() ## 1. Extract reference line marker (e.g. 
[1]) from start of line and tag it: ## get patterns to identify numeration markers at the start of lines: marker_patterns = get_reference_line_numeration_marker_patterns() marker_match = perform_regex_match_upon_line_with_pattern_list(processed_line, marker_patterns) if marker_match is not None: ## found a marker: marker_val = marker_match.group(u'mark') ## trim the marker from the start of the line: processed_line = processed_line[marker_match.end():].lstrip() else: marker_val = u" " ## Now display the marker in marked-up XML: xml_line += _refextract_markup_reference_line_marker_as_marcxml(marker_val) ## 2. Loop through remaining identified segments in line and tag them into MARC XML segments: cur_misc_txt = u"" ## a marker to hold gathered miscellaneous text before a citation tag_match = sre_tagged_citation.search(processed_line) while tag_match is not None: ## found a tag - process it: tag_match_start = tag_match.start() tag_match_end = tag_match.end() tag_type = tag_match.group(1) if tag_type == "TITLE": ## This tag is an identified journal TITLE. It should be followed by VOLUME, ## YEAR and PAGE tags. cur_misc_txt += processed_line[0:tag_match_start] ## extract the title from the line: idx_closing_tag = processed_line.find(CFG_REFEXTRACT_MARKER_CLOSING_TITLE, tag_match_end) ## Sanity check - did we find a closing TITLE tag? if idx_closing_tag == -1: ## no closing </cds.TITLE> tag found - strip the opening tag and move past it processed_line = processed_line[tag_match_end:] else: ## Closing tag was found: title_text = processed_line[tag_match_end:idx_closing_tag] ## Now trim this matched title and its tags from the start of the line: processed_line = processed_line[idx_closing_tag+len(CFG_REFEXTRACT_MARKER_CLOSING_TITLE):] ## Was this title followed by the tags of recognised VOLUME, YEAR and PAGE objects? 
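## (Illustrative note: at this point the remainder of the line is expected to begin with tagged numeration of
## roughly the form ' <cds.VOL>572</cds.VOL> <cds.YR>(2000)</cds.YR> <cds.PG>211</cds.PG>' - an assumed example
## layout; groups 2, 3 and 4 of the numeration pattern below capture the volume, year and page respectively.)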
numeration_match = sre_recognised_numeration_for_title.match(processed_line) if numeration_match is not None: ## recognised numeration immediately after the title - extract it: reference_volume = numeration_match.group(2) reference_year = numeration_match.group(3) reference_page = numeration_match.group(4) ## Skip past the matched numeration in the working line: processed_line = processed_line[numeration_match.end():] if previously_cited_item is None: ## There is no previously cited item - this should be added as the previously ## cited item: previously_cited_item = { 'type' : "TITLE", 'misc_txt' : cur_misc_txt, 'title' : title_text, 'volume' : reference_volume, 'year' : reference_year, 'page' : reference_page, } ## Now empty the miscellaneous text and title components: cur_misc_txt = "" title_text = "" reference_volume = "" reference_year = "" reference_page = "" elif (previously_cited_item is not None) and \ (previously_cited_item['type'] == "REPORTNUMBER") and \ (len(cur_misc_txt.lower().replace("arxiv", "").strip(".,:;- []")) == 0): ## This TITLE belongs with the REPORT NUMBER before it - add them both into ## the same datafield tag (REPORT NUMBER first, TITLE second): prev_report_num = previously_cited_item['report_num'] prev_misc_txt = previously_cited_item['misc_txt'].lstrip(".;, ").rstrip() xml_line += \ _refextract_markup_title_followed_by_report_number_as_marcxml(title_text, reference_volume, reference_year, reference_page, prev_report_num, prev_misc_txt) ## Increment the stats counters: ## if len(prev_misc_txt) > 0: ## count_misc += 1 count_title += 1 count_reportnum += 1 ## reset the various variables: previously_cited_item = None cur_misc_txt = u"" title_text = "" reference_volume = "" reference_year = "" reference_page = "" else: ## either the previously cited item is NOT a REPORT NUMBER, or this cited TITLE ## is preceded by miscellaneous text. In either case, the two cited objects are ## not the same and do not belong together in the same datafield. if previously_cited_item['type'] == "REPORTNUMBER": ## previously cited item was a REPORT NUMBER. ## Add previously cited REPORT NUMBER to XML string: prev_report_num = previously_cited_item['report_num'] prev_misc_txt = previously_cited_item['misc_txt'].lstrip(".;, ").rstrip() xml_line += _refextract_markup_reportnumber_as_marcxml(prev_report_num, prev_misc_txt) ## Increment the stats counters: ## if len(prev_misc_txt) > 0: ## count_misc += 1 count_reportnum += 1 elif previously_cited_item['type'] == "TITLE": ## previously cited item was a TITLE. ## Add previously cited TITLE to XML string: prev_title = previously_cited_item['title'] prev_volume = previously_cited_item['volume'] prev_year = previously_cited_item['year'] prev_page = previously_cited_item['page'] prev_misc_txt = previously_cited_item['misc_txt'].lstrip(".;, ").rstrip() xml_line += _refextract_markup_title_as_marcxml(prev_title, prev_volume, prev_year, prev_page, prev_misc_txt) ## Increment the stats counters: ## if len(prev_misc_txt) > 0: ## count_misc += 1 count_title += 1 ## Now add the current cited item into the previously cited item marker previously_cited_item = { 'type' : "TITLE", 'misc_txt' : cur_misc_txt, 'title' : title_text, 'volume' : reference_volume, 'year' : reference_year, 'page' : reference_page, } ## empty miscellaneous text cur_misc_txt = u"" title_text = "" reference_volume = "" reference_year = "" reference_page = "" else: ## No numeration was recognised after the title.
Add the title into misc and carry on: cur_misc_txt += " %s" % title_text elif tag_type == "REPORTNUMBER": ## This tag is an identified institutional report number: ## Account for the miscellaneous text before the citation: cur_misc_txt += processed_line[0:tag_match_start] ## extract the institutional report-number from the line: idx_closing_tag = processed_line.find(CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM, tag_match_end) ## Sanity check - did we find a closing report-number tag? if idx_closing_tag == -1: ## no closing </cds.REPORTNUMBER> tag found - strip the opening tag and move past this ## recognised reportnumber as it is unreliable: processed_line = processed_line[tag_match_end:] else: ## closing tag was found report_num = processed_line[tag_match_end:idx_closing_tag] ## now trim this matched institutional report-number and its tags from the start of the line: processed_line = processed_line[idx_closing_tag+len(CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM):] ## Now, if there was a previous TITLE citation and this REPORT NUMBER citation has no ## miscellaneous text before it after punctuation has been stripped, the two refer to the same object, ## so group them under the same datafield: if previously_cited_item is None: ## There is no previously cited item - this should be added as the previously ## cited item: previously_cited_item = { 'type' : "REPORTNUMBER", 'misc_txt' : "%s" % cur_misc_txt, 'report_num' : "%s" % report_num, } ## empty miscellaneous text cur_misc_txt = u"" report_num = u"" elif (previously_cited_item is not None) and \ (previously_cited_item['type'] == "TITLE") and \ (len(cur_misc_txt.lower().replace("arxiv", "").strip(".,:;- []")) == 0): ## This REPORT NUMBER belongs with the title before it - add them both into ## the same datafield tag (TITLE first, REPORT NUMBER second): prev_title = previously_cited_item['title'] prev_volume = previously_cited_item['volume'] prev_year = previously_cited_item['year'] prev_page = previously_cited_item['page'] prev_misc_txt = previously_cited_item['misc_txt'].lstrip(".;, ").rstrip() xml_line += \ _refextract_markup_title_followed_by_report_number_as_marcxml(prev_title, prev_volume, prev_year, prev_page, report_num, prev_misc_txt) ## Increment the stats counters: ## if len(prev_misc_txt) > 0: ## count_misc += 1 count_title += 1 count_reportnum += 1 ## Reset variables: previously_cited_item = None cur_misc_txt = u"" else: ## either the previously cited item is NOT a TITLE, or this cited REPORT NUMBER ## is preceded by miscellaneous text. In either case, the two cited objects are ## not the same and do not belong together in the same datafield. if previously_cited_item['type'] == "REPORTNUMBER": ## previously cited item was a REPORT NUMBER. ## Add previously cited REPORT NUMBER to XML string: prev_report_num = previously_cited_item['report_num'] prev_misc_txt = previously_cited_item['misc_txt'].lstrip(".;, ").rstrip() xml_line += _refextract_markup_reportnumber_as_marcxml(prev_report_num, prev_misc_txt) ## Increment the stats counters: ## if len(prev_misc_txt) > 0: ## count_misc += 1 count_reportnum += 1 elif previously_cited_item['type'] == "TITLE": ## previously cited item was a TITLE.
## Add previously cited TITLE to XML string: prev_title = previously_cited_item['title'] prev_volume = previously_cited_item['volume'] prev_year = previously_cited_item['year'] prev_page = previously_cited_item['page'] prev_misc_txt = previously_cited_item['misc_txt'].lstrip(".;, ").rstrip() xml_line += _refextract_markup_title_as_marcxml(prev_title, prev_volume, prev_year, prev_page, prev_misc_txt) ## Increment the stats counters: ## if len(prev_misc_txt) > 0: ## count_misc += 1 count_title += 1 ## Now add the current cited item into the previously cited item marker previously_cited_item = { 'type' : "REPORTNUMBER", 'misc_txt' : "%s" % cur_misc_txt, 'report_num' : "%s" % report_num, } ## empty miscellaneous text cur_misc_txt = u"" report_num = u"" elif tag_type == "URL": ## This tag is an identified URL: ## Account for the miscellaneous text before the URL: cur_misc_txt += processed_line[0:tag_match_start] ## extract the URL information from within the tags in the line: idx_closing_tag = processed_line.find(CFG_REFEXTRACT_MARKER_CLOSING_URL, tag_match_end) ## Sanity check - did we find a closing URL tag? if idx_closing_tag == -1: ## no closing </cds.URL> tag found - strip the opening tag and move past it processed_line = processed_line[tag_match_end:] else: ## Closing tag was found: ## First, get the URL string from between the tags: url_string = processed_line[tag_match_end:idx_closing_tag] ## Now, get the URL description string from within the opening cds tag. E.g.: ## from <cds.URL description="abc"> get the "abc" value: opening_url_tag = processed_line[tag_match_start:tag_match_end] if opening_url_tag.find(u"""<cds.URL description=\"""") != -1: ## the description is present - extract it: ## (Stop 2 characters before the end of the string - we assume they are the ## closing characters '">'. url_descr = opening_url_tag[22:-2] else: ## There is no description - description should now be the url string: url_descr = url_string ## now trim this URL and its tags from the start of the line: processed_line = processed_line[idx_closing_tag+len(CFG_REFEXTRACT_MARKER_CLOSING_URL):] ## Build the MARC XML representation of this identified URL: if previously_cited_item is not None: ## There was a previously cited item. We must convert it to XML before we can ## convert this URL to XML: if previously_cited_item['type'] == "REPORTNUMBER": ## previously cited item was a REPORT NUMBER. ## Add previously cited REPORT NUMBER to XML string: prev_report_num = previously_cited_item['report_num'] prev_misc_txt = previously_cited_item['misc_txt'].lstrip(".;, ").rstrip() xml_line += _refextract_markup_reportnumber_as_marcxml(prev_report_num, prev_misc_txt) ## Increment the stats counters: ## if len(prev_misc_txt) > 0: ## count_misc += 1 count_reportnum += 1 elif previously_cited_item['type'] == "TITLE": ## previously cited item was a TITLE. 
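## (As in the REPORT NUMBER case above, the pending item must be flushed into the XML string before the URL itself can be marked up.)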
## Add previously cited TITLE to XML string: prev_title = previously_cited_item['title'] prev_volume = previously_cited_item['volume'] prev_year = previously_cited_item['year'] prev_page = previously_cited_item['page'] prev_misc_txt = previously_cited_item['misc_txt'].lstrip(".;, ").rstrip() xml_line += _refextract_markup_title_as_marcxml(prev_title, prev_volume, prev_year, prev_page, prev_misc_txt) ## Increment the stats counters: ## if len(prev_misc_txt) > 0: ## count_misc += 1 count_title += 1 ## Empty the previously-cited item place-holder: previously_cited_item = None ## Now convert this URL to MARC XML cur_misc_txt = cur_misc_txt.lstrip(".;, ").rstrip() if url_string.find("http //") == 0: url_string = u"http://" + url_string[7:] elif url_string.find("ftp //") == 0: url_string = u"ftp://" + url_string[6:] if url_descr.find("http //") == 0: url_descr = u"http://" + url_descr[7:] elif url_descr.find("ftp //") == 0: url_descr = u"ftp://" + url_descr[6:] xml_line += \ _refextract_markup_url_as_marcxml(url_string, url_descr, cur_misc_txt) ## Increment the stats counters: ## if len(cur_misc_txt) > 0: ## count_misc += 1 count_url += 1 cur_misc_txt = u"" elif tag_type == "SER": ## This tag is a SERIES tag; Since it was not preceded by a TITLE tag, ## it is useless - strip the tag and put it into miscellaneous: (cur_misc_txt, processed_line) = \ _convert_unusable_tag_to_misc(processed_line, cur_misc_txt, \ tag_match_start,tag_match_end, CFG_REFEXTRACT_MARKER_CLOSING_SERIES) elif tag_type == "VOL": ## This tag is a VOLUME tag; Since it was not preceded by a TITLE tag, ## it is useless - strip the tag and put it into miscellaneous: (cur_misc_txt, processed_line) = \ _convert_unusable_tag_to_misc(processed_line, cur_misc_txt, \ tag_match_start,tag_match_end, CFG_REFEXTRACT_MARKER_CLOSING_VOLUME) elif tag_type == "YR": ## This tag is a YEAR tag; Since it's not preceded by TITLE and VOLUME tags, it ## is useless - strip the tag and put the contents into miscellaneous: (cur_misc_txt, processed_line) = \ _convert_unusable_tag_to_misc(processed_line, cur_misc_txt, \ tag_match_start,tag_match_end, CFG_REFEXTRACT_MARKER_CLOSING_YEAR) elif tag_type == "PG": ## This tag is a PAGE tag; Since it's not preceded by TITLE, VOLUME and YEAR tags, ## it is useless - strip the tag and put the contents into miscellaneous: (cur_misc_txt, processed_line) = \ _convert_unusable_tag_to_misc(processed_line, cur_misc_txt, \ tag_match_start,tag_match_end, CFG_REFEXTRACT_MARKER_CLOSING_PAGE) else: ## Unknown tag - discard as miscellaneous text: cur_misc_txt += processed_line[0:tag_match.end()] processed_line = processed_line[tag_match.end():] ## Look for the next tag in the processed line: tag_match = sre_tagged_citation.search(processed_line) ## If a previously cited item remains, convert it into MARC XML: if previously_cited_item is not None: if previously_cited_item['type'] == "REPORTNUMBER": ## previously cited item was a REPORT NUMBER. ## Add previously cited REPORT NUMBER to XML string: prev_report_num = previously_cited_item['report_num'] prev_misc_txt = previously_cited_item['misc_txt'].lstrip(".;, ").rstrip() xml_line += _refextract_markup_reportnumber_as_marcxml(prev_report_num, prev_misc_txt) ## Increment the stats counters: ## if len(prev_misc_txt) > 0: ## count_misc += 1 count_reportnum += 1 elif previously_cited_item['type'] == "TITLE": ## previously cited item was a TITLE.
## Add previously cited TITLE to XML string: prev_title = previously_cited_item['title'] prev_volume = previously_cited_item['volume'] prev_year = previously_cited_item['year'] prev_page = previously_cited_item['page'] prev_misc_txt = previously_cited_item['misc_txt'].lstrip(".;, ").rstrip() xml_line += _refextract_markup_title_as_marcxml(prev_title, prev_volume, prev_year, prev_page, prev_misc_txt) ## Increment the stats counters: ## if len(prev_misc_txt) > 0: ## count_misc += 1 count_title += 1 ## free up previously_cited_item: previously_cited_item = None ## place any remaining miscellaneous text into the appropriate MARC XML fields: cur_misc_txt += processed_line if len(cur_misc_txt.strip(" .;,")) > 0: ## The remaining misc text is not just a full-stop or semi-colon. Add it: xml_line += _refextract_markup_miscellaneous_text_as_marcxml(cur_misc_txt) ## Increment the stats counters: count_misc += 1 ## return the reference-line as MARC XML: return (xml_line, count_misc, count_title, count_reportnum, count_url) def move_tagged_series_into_tagged_title(line): """Moves a marked-up series item into a marked-up title. E.g.: should change <cds.TITLE>Blah</cds.TITLE> <cds.SER>A</cds.SER> into: <cds.TITLE>Blah., A</cds.TITLE> @param line: (string) - the line in which a series tagged item is to be moved into title tags. @return: (string) - the line after the series items have been moved into the title tags. """ ## Seek a marked-up series occurrence in line: m_tagged_series = sre_title_followed_by_series_markup_tags.search(line) while m_tagged_series is not None: ## tagged series found in line - try to remove it and put it into the title: entire_match = m_tagged_series.group(0) ## the entire match (e.g. <cds.TITLE>xxxxx</cds.TITLE> <cds.SER>A</cds.SER>) title_match = m_tagged_series.group(2) ## the string matched between <cds.TITLE></cds.TITLE> tags series_match = m_tagged_series.group(3) ## the series information matched between <cds.SER></cds.SER> tags. corrected_title_text = title_match ## If there is no comma in the matched title, add one to the end of it before the series info is ## added. If there is already a comma present mid-title, discard the series info (the title is taken to carry a series already): if corrected_title_text.find(",") == -1: corrected_title_text = corrected_title_text.rstrip() if corrected_title_text[-1] == ".": corrected_title_text += ", %s" % series_match else: corrected_title_text += "., %s" % series_match elif corrected_title_text.rstrip()[-1:] == ",": ## There is a "," at the end of the title, but no series present. ## add it: corrected_title_text = corrected_title_text.rstrip() + " %s" % series_match line = sre.sub("%s" % sre.escape(entire_match), "<cds.TITLE>%s</cds.TITLE>" % corrected_title_text, line, 1) m_tagged_series = sre_title_followed_by_series_markup_tags.search(line) return line def _re_identify_numeration(line): """Look for other numeration in line. """ ## First, attempt to use marked-up titles line = sre_correct_numeration_2nd_try_ptn1[0].sub(sre_correct_numeration_2nd_try_ptn1[1], line) line = sre_correct_numeration_2nd_try_ptn2[0].sub(sre_correct_numeration_2nd_try_ptn2[1], line) return line def add_tagged_report_number(reading_line, len_reportnum, reportnum, startpos, true_replacement_index, extras): """In rebuilding the line, add an identified institutional REPORT-NUMBER (standardised and tagged) into the line. @param reading_line: (string) The reference line before capitalization was performed, and before REPORT-NUMBERs and TITLEs were stripped out.
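For example (an illustrative sketch): if the reading-line contains '[hep-th/9910245]' at the replacement index, the enclosing brackets are dropped and the segment '<cds.REPORTNUMBER>hep-th/9910245</cds.REPORTNUMBER>' is added into the rebuilt line (where 'hep-th/9910245' stands for the standardised replacement text passed in as 'reportnum').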
@param len_reportnum: (integer) the length of the matched REPORT-NUMBER. @param reportnum: (string) the replacement text for the matched REPORT-NUMBER. @param startpos: (integer) the pointer to the next position in the reading-line from which to start rebuilding. @param true_replacement_index: (integer) the replacement index of the matched REPORT-NUMBER in the reading-line, with stripped punctuation and whitespace accounted for. @param extras: (integer) extras to be added into the replacement index. @return: (tuple) containing a string (the rebuilt line segment) and an integer (the next 'startpos' in the reading-line). """ rebuilt_line = u"" ## The segment of the line that is being rebuilt to include the ## tagged and standardised institutional REPORT-NUMBER ## Fill rebuilt_line with the contents of the reading_line up to the point of the ## institutional REPORT-NUMBER. However, stop 1 character before the replacement ## index of this REPORT-NUMBER to allow for removal of brackets, if necessary: if (true_replacement_index - startpos - 1) >= 0: rebuilt_line += reading_line[startpos:true_replacement_index - 1] else: rebuilt_line += reading_line[startpos:true_replacement_index] ## check to see whether the REPORT-NUMBER was enclosed within brackets; drop them if so: if reading_line[true_replacement_index - 1] not in (u"[", u"("): ## no brackets enclosing the REPORT-NUMBER: rebuilt_line += reading_line[true_replacement_index - 1] ## Add the tagged REPORT-NUMBER into the rebuilt-line segment: rebuilt_line += u"<cds.REPORTNUMBER>%(reportnum)s</cds.REPORTNUMBER>" \ % { 'reportnum' : reportnum } ## move the pointer in the reading-line past the current match: startpos = true_replacement_index + len_reportnum + extras ## Move past closing bracket for report number (if there was one): try: if reading_line[startpos] in (u"]", u")"): startpos += 1 except IndexError: ## moved past end of line - ignore pass ## return the rebuilt-line segment and the pointer to the next position in the ## reading-line from which to start rebuilding up to the next match: return (rebuilt_line, startpos) def add_tagged_title_in_place_of_IBID(ibid_string, previous_match, ibid_series): """In rebuilding the line, if the matched TITLE was actually an IBID, this function will replace it with the previously matched TITLE, and add it into the line, tagged. It will even handle the series letter, if it differs. For example, if the previous match is "Nucl. Phys., B", and the ibid is "IBID A", the title inserted into the line will be "Nucl. Phys., A". Otherwise, if the IBID had no series letter, it will simply be replaced by "Nucl. Phys., B" (i.e. the previous match.) @param ibid_string: (string) - the matched IBID. @param previous_match: (string) - the previously matched TITLE. @param ibid_series: (string) - the series of the IBID (if any). @return: (tuple) containing a string (the rebuilt line segment) and another string (the newly updated previous-match). """ rebuilt_line = u"" if ibid_series != "": ## This IBID has a series letter.
If the previously matched TITLE also had a series letter ## and that series letter differs from the one carried by this IBID, the series letter stored in ## the previous-match must be updated to that of this IBID: if previous_match.find(",") != -1: ## Presence of comma in previous match could mean it has a series: m_previous_series = sre_title_series.search(previous_match) if m_previous_series is not None: previous_series = m_previous_series.group(1) if previous_series == ibid_series: ## Both the previous match and this IBID have the same series rebuilt_line += " <cds.TITLE>%(previous-match)s</cds.TITLE>" \ % { 'previous-match' : previous_match } else: ## Previous match and this IBID do not have the same series previous_match = sre.sub("(\\.?)(,?) %s$" % previous_series, \ "\\g<1>\\g<2> %s" % ibid_series, previous_match) rebuilt_line += " <cds.TITLE>%(previous-match)s</cds.TITLE>" \ % { 'previous-match' : previous_match } else: ## Series info of previous match is not a letter or roman numeral; ## cannot be sure about meaning of IBID - don't replace it rebuilt_line += ibid_string else: ## previous match had no series letter, but the IBID did. Add a comma ## followed by the IBID's series letter to the end of the previous match ## Now add the previous match into the rebuilt-line: previous_match = previous_match.rstrip() if previous_match[-1] == ".": ## Previous match ended with a full-stop. Add a comma, then the IBID series previous_match += ", %(ibid-series)s" % { 'ibid-series' : ibid_series } else: ## Previous match did not end with a full-stop. Add a full-stop then the comma, ## then the IBID series previous_match += "., %(ibid-series)s" % { 'ibid-series' : ibid_series } rebuilt_line += " <cds.TITLE>%(previous-match)s</cds.TITLE>" \ % { 'previous-match' : previous_match } else: ## IBID's series letter is empty - Replace as-is: rebuilt_line += " <cds.TITLE>%(previous-match)s</cds.TITLE>" \ % { 'previous-match' : previous_match } return (rebuilt_line, previous_match) def add_tagged_title(reading_line, len_title, matched_title, previous_match, startpos, true_replacement_index, extras, standardised_titles): """In rebuilding the line, add an identified periodical TITLE (standardised and tagged) into the line. @param reading_line: (string) The reference line before capitalization was performed, and before REPORT-NUMBERs and TITLEs were stripped out. @param len_title: (integer) the length of the matched TITLE. @param matched_title: (string) the matched TITLE text. @param previous_match: (string) the previous periodical TITLE citation to have been matched in the current reference line. It is used when replacing an IBID instance in the line. @param startpos: (integer) the pointer to the next position in the reading-line from which to start rebuilding. @param true_replacement_index: (integer) the replacement index of the matched TITLE in the reading-line, with stripped punctuation and whitespace accounted for. @param extras: (integer) extras to be added into the replacement index. @param standardised_titles: (dictionary) the standardised versions of periodical titles, keyed by their various non-standard versions. @return: (tuple) containing a string (the rebuilt line segment), an integer (the next 'startpos' in the reading-line), and another string (the newly updated previous-match).
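E.g. (illustrative, using an assumed knowledge-base mapping): if matched_title is 'PHYS REV D' and standardised_titles maps it to 'Phys. Rev., D', the segment '<cds.TITLE>Phys. Rev., D</cds.TITLE>' is added into the rebuilt line and 'Phys. Rev., D' becomes the new previous-match.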
""" ## Fill 'rebuilt_line' (the segment of the line that is being rebuilt to include the ## tagged and standardised periodical TITLE) with the contents of the reading-line, ## up to the point of the matched TITLE: rebuilt_line = reading_line[startpos:true_replacement_index] ## Test to see whether a title or an "IBID" was matched: if matched_title.upper().find("IBID") != -1: ## This is an IBID ## Try to replace the IBID with a title: if previous_match != "": ## A title has already been replaced in this line - IBID can be replaced meaninfully ## First, try to get the series number/letter of this IBID: m_ibid = sre_matched_ibid.search(matched_title) try: series = m_ibid.group(1) except IndexError: series = u"" if series is None: series = u"" ## Replace this IBID with the previous title match, if possible: (replaced_ibid_segment, previous_match) = \ add_tagged_title_in_place_of_IBID(matched_title, previous_match, series) rebuilt_line += replaced_ibid_segment ## Update start position for next segment of original line: startpos = true_replacement_index + len_title + extras ## Skip past any punctuation at the end of the replacement that was just made: try: if reading_line[startpos] in (".", ":", ";"): startpos += 1 except IndexError: ## The match was at the very end of the line pass else: ## no previous title-replacements in this line - IBID refers to something unknown and ## cannot be replaced: rebuilt_line += \ reading_line[true_replacement_index:true_replacement_index + len_title + extras] startpos = true_replacement_index + len_title + extras else: ## This is a normal title, not an IBID rebuilt_line += "<cds.TITLE>%(title)s</cds.TITLE>" % { 'title' : standardised_titles[matched_title] } previous_match = standardised_titles[matched_title] startpos = true_replacement_index + len_title + extras ## Skip past any punctuation at the end of the replacement that was just made: try: if reading_line[startpos] in (".", ":", ";"): startpos += 1 except IndexError: ## The match was at the very end of the line pass try: if reading_line[startpos] == ")": startpos += 1 except IndexError: ## The match was at the very end of the line pass ## return the rebuilt line-segment, the position (of the reading line) from which the ## next part of the rebuilt line should be started, and the newly updated previous match. return (rebuilt_line, startpos, previous_match) def create_marc_xml_reference_section(ref_sect, preprint_repnum_search_kb, preprint_repnum_standardised_categs, periodical_title_search_kb, standardised_periodical_titles, periodical_title_search_keys): """Passed a complete reference section, process each line and attempt to identify and standardise individual citations within the line. @param ref_sect: (list) of strings - each string in the list is a reference line. @param preprint_repnum_search_kb: (dictionary) - keyed by a tuple (containing the line-number of the pattern in the KB and the non-standard category string. E.g.: (3, 'ASTRO PH'). Value is regexp pattern used to search for that report-number. @param preprint_repnum_standardised_categs: (dictionary) - keyed by non-standard version of institutional report number, value is the standardised version of that report number. @param periodical_title_search_kb: (dictionary) - keyed by non-standard title to search for, value is the compiled regexp pattern used to search for that title. @param standardised_periodical_titles: (dictionary) - keyed by non-standard title to search for, value is the standardised version of that title. 
@param periodical_title_search_keys: (list) - ordered list of non-standard titles to search for. @return: (tuple) of 5 components: ( list -> of strings, each string is a MARC XML-ized reference line. integer -> number of fields of miscellaneous text found for the record. integer -> number of title citations found for the record. integer -> number of institutional report-number citations found for the record. integer -> number of URL citations found for the record. ) """ ## a list to contain the processed reference lines: xml_ref_sectn = [] ## counters for extraction stats: count_misc = count_title = count_reportnum = count_url = 0 ## process references line-by-line: for ref_line in ref_sect: ## initialise some variables: ## dictionaries to record information about, and coordinates of, matched IBID items: found_ibids_len = {} found_ibids_matchtext = {} ## dictionaries to record information about, and coordinates of, matched journal title items: found_title_len = {} found_title_matchtext = {} ## dictionaries to record information about, and the coordinates of, matched preprint report ## number items found_pprint_repnum_matchlens = {} ## lengths of given matches of preprint report numbers found_pprint_repnum_replstr = {} ## standardised replacement strings for preprint report numbers ## to be substituted into a line ## take a copy of the line as a first working line, clean it of bad accents, and correct punctuation, etc.: working_line1 = wash_line(ref_line) ## Identify and standardise numeration in the line: working_line1 = standardize_and_markup_numeration_of_citations_in_line(working_line1) ## Identify and replace URLs in the line: working_line1 = identify_and_tag_URLs(working_line1) ## Clean the line once more: working_line1 = wash_line(working_line1) ## Transform the line to upper-case, now making a new working line: working_line2 = working_line1.upper() ## Strip punctuation from the line: working_line2 = sre_punctuation.sub(u' ', working_line2) ## Remove multiple spaces from the line, recording information about their coordinates: (removed_spaces, working_line2) = remove_and_record_multiple_spaces_in_line(working_line2) ## Identify and record coordinates of institute preprint report numbers: (found_pprint_repnum_matchlens, found_pprint_repnum_replstr, working_line2) = \ identify_preprint_report_numbers(working_line2, preprint_repnum_search_kb, preprint_repnum_standardised_categs) ## Identify and record coordinates of non-standard journal titles: (found_title_len, found_title_matchtext, working_line2) = \ identify_periodical_titles(working_line2, periodical_title_search_kb, periodical_title_search_keys) ## Attempt to identify, record and replace any IBIDs in the line: if working_line2.upper().find(u"IBID") != -1: ## there is at least one IBID in the line - try to identify its meaning: (found_ibids_len, found_ibids_matchtext, working_line2) = identify_ibids(working_line2) ## now update the dictionary of matched title lengths with the matched IBID(s) lengths information: found_title_len.update(found_ibids_len) found_title_matchtext.update(found_ibids_matchtext) ## Using the recorded information, create a MARC XML representation of the rebuilt line: ## At the same time, get stats of citations found in the reference line (titles, urls, etc): (xml_line, this_count_misc, this_count_title, \ this_count_reportnum, this_count_url) = \ create_marc_xml_reference_line(working_line=working_line1, found_title_len=found_title_len, found_title_matchtext=found_title_matchtext,
pprint_repnum_len=found_pprint_repnum_matchlens, pprint_repnum_matchtext=found_pprint_repnum_replstr, removed_spaces=removed_spaces, standardised_titles=standardised_periodical_titles) count_misc += this_count_misc count_title += this_count_title count_reportnum += this_count_reportnum count_url += this_count_url ## Append the rebuilt line details to the list of MARC XML reference lines: xml_ref_sectn.append(xml_line) ## Return the list of processed reference lines: return (xml_ref_sectn, count_misc, count_title, count_reportnum, count_url) ## Tasks related to extraction of reference section from full-text: ## ----> 1. Removing page-breaks, headers and footers before searching for reference section: def strip_headers_footers_pagebreaks(docbody, page_break_posns, num_head_lines, num_foot_lines): """Remove page-break lines, header lines, and footer lines from the document. @param docbody: (list) of strings, whereby each string in the list is a line in the document. @param page_break_posns: (list) of integers, whereby each integer represents the index in docbody at which a page-break is found. @param num_head_lines: (int) the number of header lines each page in the document has. @param num_foot_lines: (int) the number of footer lines each page in the document has. @return: (list) of strings - the document body after the headers, footers, and page-break lines have been stripped from the list. """ num_breaks = (len(page_break_posns)) page_lens = [] for x in xrange(0, num_breaks): if x < num_breaks - 1: page_lens.append(page_break_posns[x + 1] - page_break_posns[x]) page_lens.sort() if (len(page_lens) > 0) and (num_head_lines + num_foot_lines + 1 < page_lens[0]): ## Safe to chop hdrs & ftrs page_break_posns.reverse() first = 1 for i in xrange(0, len(page_break_posns)): ## Unless this is the last page break, chop headers if not first: for dummy in xrange(1, num_head_lines + 1): docbody[page_break_posns[i] + 1:page_break_posns[i] + 2] = [] else: first = 0 ## Chop page break itself docbody[page_break_posns[i]:page_break_posns[i] + 1] = [] ## Chop footers (unless this is the first page break) if i != len(page_break_posns) - 1: for dummy in xrange(1, num_foot_lines + 1): docbody[page_break_posns[i] - num_foot_lines:page_break_posns[i] - num_foot_lines + 1] = [] return docbody def check_boundary_lines_similar(l_1, l_2): """Compare two lists to see if their elements are roughly the same. @param l_1: (list) of strings. @param l_2: (list) of strings. @return: (int) 1/0. """ num_matches = 0 if (type(l_1) != list) or (type(l_2) != list) or (len(l_1) != len(l_2)): ## these 'boundaries' are not similar return 0 num_elements = len(l_1) for i in xrange(0, num_elements): if l_1[i].isdigit() and l_2[i].isdigit(): ## both lines are integers num_matches = num_matches + 1 else: l1_str = l_1[i].lower() l2_str = l_2[i].lower() if (l1_str[0] == l2_str[0]) and (l1_str[len(l1_str) - 1] == l2_str[len(l2_str) - 1]): num_matches = num_matches + 1 if (len(l_1) == 0) or (float(num_matches) / float(len(l_1)) < 0.9): return 0 else: return 1 def get_number_header_lines(docbody, page_break_posns): """Try to guess the number of header lines each page of a document has. The positions of the page breaks in the document are used to try to guess the number of header lines. @param docbody: (list) of strings - each string being a line in the document @param page_break_posns: (list) of integers - each integer is the position of a page break in the document. @return: (int) the number of lines that make up the header of each page.
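The guess is made by comparing the candidate header lines that follow each page-break: lines that remain roughly similar from page to page (as judged by check_boundary_lines_similar()) are assumed to be header lines, and the count is increased until a dissimilar line is met.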
""" remaining_breaks = (len(page_break_posns) - 1) num_header_lines = empty_line = 0 ## pattern to search for a word in a line: p_wordSearch = sre.compile(unicode(r'([A-Za-z0-9-]+)'), sre.UNICODE) if remaining_breaks > 2: if remaining_breaks > 3: # Only check odd page headers next_head = 2 else: # Check headers on each page next_head = 1 keep_checking = 1 while keep_checking: cur_break = 1 if docbody[(page_break_posns[cur_break] + num_header_lines + 1)].isspace(): ## this is a blank line empty_line = 1 if (page_break_posns[cur_break] + num_header_lines + 1) == (page_break_posns[(cur_break + 1)]): # Have reached next page-break: document has no body - only head/footers! keep_checking = 0 grps_headLineWords = p_wordSearch.findall(docbody[(page_break_posns[cur_break] + num_header_lines + 1)]) cur_break = cur_break + next_head while (cur_break < remaining_breaks) and keep_checking: grps_thisLineWords = p_wordSearch.findall(docbody[(page_break_posns[cur_break]+ num_header_lines + 1)]) if empty_line: if len(grps_thisLineWords) != 0: ## This line should be empty, but isn't keep_checking = 0 else: if (len(grps_thisLineWords) == 0) or (len(grps_headLineWords) != len(grps_thisLineWords)): ## Not same num 'words' as equivilent line in 1st header: keep_checking = 0 else: keep_checking = check_boundary_lines_similar(grps_headLineWords, grps_thisLineWords) ## Update cur_break for nxt line to check cur_break = cur_break + next_head if keep_checking: ## Line is a header line: check next num_header_lines = num_header_lines + 1 empty_line = 0 return num_header_lines def get_number_footer_lines(docbody, page_break_posns): """Try to guess the number of footer lines each page of a document has. The positions of the page breaks in the document are used to try to guess the number of footer lines. @param docbody: (list) of strings - each string being a line in the document @param page_break_posns: (list) of integers - each integer is the position of a page break in the document. @return: (int) the number of lines that make up the footer of each page. """ num_breaks = (len(page_break_posns)) num_footer_lines = 0 empty_line = 0 keep_checking = 1 p_wordSearch = sre.compile(unicode(r'([A-Za-z0-9-]+)'), sre.UNICODE) if num_breaks > 2: while keep_checking: cur_break = 1 if docbody[(page_break_posns[cur_break] - num_footer_lines - 1)].isspace(): empty_line = 1 grps_headLineWords = p_wordSearch.findall(docbody[(page_break_posns[cur_break] - num_footer_lines - 1)]) cur_break = cur_break + 1 while (cur_break < num_breaks) and keep_checking: grps_thisLineWords = p_wordSearch.findall(docbody[(page_break_posns[cur_break] - num_footer_lines - 1)]) if empty_line: if len(grps_thisLineWords) != 0: ## this line should be empty, but isn't keep_checking = 0 else: if (len(grps_thisLineWords) == 0) or (len(grps_headLineWords) != len(grps_thisLineWords)): ## Not same num 'words' as equivilent line in 1st footer: keep_checking = 0 else: keep_checking = check_boundary_lines_similar(grps_headLineWords, grps_thisLineWords) ## Update cur_break for nxt line to check cur_break = cur_break + 1 if keep_checking: ## Line is a footer line: check next num_footer_lines = num_footer_lines + 1 empty_line = 0 return num_footer_lines def get_page_break_positions(docbody): """Locate page breaks in the list of document lines and create a list positions in the document body list. @param docbody: (list) of strings - each string is a line in the document. 
@return: (list) of integer positions, whereby each integer represents the position (in the document body) of a page-break. """ page_break_posns = [] p_break = sre.compile(unicode(r'^\s*?\f\s*?$'), sre.UNICODE) num_document_lines = len(docbody) for i in xrange(num_document_lines): if p_break.match(docbody[i]) != None: page_break_posns.append(i) return page_break_posns def document_contains_text(docbody): """Test whether document contains text, or is just full of worthless whitespace. @param docbody: (list) of strings - each string being a line of the document's body @return: (integer) 1 if non-whitespace found in document; 0 if only whitespace found in document. """ found_non_space = 0 for line in docbody: if not line.isspace(): ## found a non-whitespace character in this line found_non_space = 1 break return found_non_space def remove_page_boundary_lines(docbody): """Try to locate page breaks, headers and footers within a document body, and remove the array cells at which they are found. @param docbody: (list) of strings, each string being a line in the document's body. @return: (list) of strings. The document body, hopefully with page-breaks, headers and footers removed. Each string in the list once more represents a line in the document. """ number_head_lines = number_foot_lines = 0 ## Make sure document not just full of whitespace: if not document_contains_text(docbody): ## document contains only whitespace - cannot safely strip headers/footers return docbody ## Get list of index posns of pagebreaks in document: page_break_posns = get_page_break_positions(docbody) ## Get num lines making up each header if poss: number_head_lines = get_number_header_lines(docbody, page_break_posns) ## Get num lines making up each footer if poss: number_foot_lines = get_number_footer_lines(docbody, page_break_posns) ## Remove pagebreaks,headers,footers: docbody = strip_headers_footers_pagebreaks(docbody, page_break_posns, number_head_lines, number_foot_lines) return docbody ## ----> 2. Finding reference section in full-text: def _create_regex_pattern_add_optional_spaces_to_word_characters(word): """Add the regex special characters (\s*?) to allow optional spaces between the characters in a word. @param word: (string) the word to be inserted into a regex pattern. @return: string: the regex pattern for that word with optional spaces between all of its characters. """ new_word = u"" for ch in word: if ch.isspace(): new_word += ch else: new_word += ch + unicode(r'\s*?') return new_word def get_reference_section_title_patterns(): """Return a list of compiled regex patterns used to search for the title of a reference section in a full-text document. @return: (list) of compiled regex patterns. 
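       For example (illustrative cases only): lines such as 'References' alone on a line,
        'REFERENCES:', or '6. Bibliography' are the kind of section headings these patterns are
        intended to match.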
""" patterns = [] titles = [ u'references', u'r\u00C9f\u00E9rences', u'r\u00C9f\u00C9rences', u'reference', u'refs', u'r\u00E9f\u00E9rence', u'r\u00C9f\u00C9rence', u'r\xb4ef\xb4erences', u'r\u00E9fs', u'r\u00C9fs', u'bibliography', u'bibliographie', u'citations' ] sect_marker = unicode(r'^\s*?([\[\-\{\(])?\s*?((\w|\d){1,5}([\.\-\,](\w|\d){1,5})?\s*?[\.\-\}\)\]]\s*?)?(?P<title>') line_end = unicode(r'(\s+?s\s*?e\s*?c\s*?t\s*?i\s*?o\s*?n\s*?)?)') line_end += unicode(r'($|\s*?[\[\{\(\<]\s*?[1a-z]\s*?[\}\)\>\]]|\:)') for t in titles: if len(t) > 0: ## don't append empty titles: t_ptn = sre.compile(sect_marker + \ _create_regex_pattern_add_optional_spaces_to_word_characters(t) + \ line_end, sre.I|sre.UNICODE) patterns.append(t_ptn) return patterns def get_reference_line_numeration_marker_patterns(prefix=u''): """Return a list of compiled regex patterns used to search for the marker of a reference line in a full-text document. @param prefix: (string) the possible prefix to a reference line @return: (list) of compiled regex patterns. """ compiled_ptns = [] title = u"" if type(prefix) in (str, unicode): title = prefix g_name = unicode(r'(?P<mark>') g_close = u')' space = unicode(r'\s*?') - patterns = [ space + title + g_name + unicode(r'\[\s*?(?P<linenumber>\d+)\s*?\]') + g_close, + patterns = [ space + title + g_name + unicode(r'\[\s*?(?P<marknum>\d+)\s*?\]') + g_close, space + title + g_name + unicode(r'\[\s*?[a-zA-Z]+\s?(\d{1,4}[A-Za-z]?)?\s*?\]') + g_close, - space + title + g_name + unicode(r'\{\s*?\d+\s*?\}') + g_close, - space + title + g_name + unicode(r'\<\s*?\d+\s*?\>') + g_close, - space + title + g_name + unicode(r'\(\s*?\d+\s*?\)') + g_close, + space + title + g_name + unicode(r'\{\s*?(?P<marknum>\d+)\s*?\}') + g_close, + space + title + g_name + unicode(r'\<\s*?(?P<marknum>\d+)\s*?\>') + g_close, + space + title + g_name + unicode(r'\(\s*?(?P<marknum>\d+)\s*?\)') + g_close, space + title + g_name + unicode(r'(?P<marknum>\d+)\s*?\.') + g_close, - space + title + g_name + unicode(r'\d+\s*?') + g_close, - space + title + g_name + unicode(r'\d+\s*?\]') + g_close, - space + title + g_name + unicode(r'\d+\s*?\}') + g_close, - space + title + g_name + unicode(r'\d+\s*?\)') + g_close, - space + title + g_name + unicode(r'\d+\s*?\>') + g_close, + space + title + g_name + unicode(r'(?P<marknum>\d+)\s*?') + g_close, + space + title + g_name + unicode(r'(?P<marknum>\d+)\s*?\]') + g_close, + space + title + g_name + unicode(r'(?P<marknum>\d+)\s*?\}') + g_close, + space + title + g_name + unicode(r'(?P<marknum>\d+)\s*?\)') + g_close, + space + title + g_name + unicode(r'(?P<marknum>\d+)\s*?\>') + g_close, space + title + g_name + unicode(r'\[\s*?\]') + g_close, space + title + g_name + unicode(r'\*') + g_close ] for p in patterns: compiled_ptns.append(sre.compile(p, sre.I|sre.UNICODE)) return compiled_ptns def get_first_reference_line_numeration_marker_patterns(): """Return a list of compiled regex patterns used to search for the first reference line in a full-text document. The line is considered to start with either: [1] or {1} @return: (list) of compiled regex patterns. 
""" compiled_patterns = [] g_name = unicode(r'(?P<mark>') g_close = u')' patterns = [ g_name + unicode(r'(?P<left>\[)\s*?(?P<num>\d+)\s*?(?P<right>\])') + g_close, g_name + unicode(r'(?P<left>\{)\s*?(?P<num>\d+)\s*?(?P<right>\})') + g_close ] for p in patterns: compiled_patterns.append(sre.compile(p, sre.I|sre.UNICODE)) return compiled_patterns def get_post_reference_section_title_patterns(): """Return a list of compiled regex patterns used to search for the title of the section after the reference section in a full-text document. @return: (list) of compiled regex patterns. """ compiled_patterns = [] thead = unicode(r'^\s*?([\{\(\<\[]?\s*?(\w|\d)\s*?[\)\}\>\.\-\]]?\s*?)?') ttail = unicode(r'(\s*?\:\s*?)?') numatn = unicode(r'(\d+|\w\b|i{1,3}v?|vi{0,3})[\.\,]?\b') ## Section titles: patterns = [ thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'appendix') + ttail, thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'appendices') + ttail, thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'acknowledgement') + unicode(r's?') + ttail, thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'table') + unicode(r'\w?s?\d?') + ttail, thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'figure') + unicode(r's?') + ttail, thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'annex') + unicode(r's?') + ttail, thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'discussion') + unicode(r's?') + ttail, thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'remercie') + unicode(r's?') + ttail, ## Figure nums: r'^\s*?' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'figure') + numatn, r'^\s*?' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'fig') + unicode(r'\.\s*?') + numatn, r'^\s*?' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'fig') + unicode(r'\.?\s*?\d\w?\b'), ## tables: r'^\s*?' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'table') + numatn, r'^\s*?' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'tab') + unicode(r'\.\s*?') + numatn, r'^\s*?' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'tab') + unicode(r'\.?\s*?\d\w?\b') ] for p in patterns: compiled_patterns.append(sre.compile(p, sre.I|sre.UNICODE)) return compiled_patterns def get_post_reference_section_keyword_patterns(): """Return a list of compiled regex patterns used to search for various keywords that can often be found after, and therefore suggest the end of, a reference section in a full-text document. @return: (list) of compiled regex patterns. 
""" compiled_patterns = [] patterns = [ unicode(r'(') + _create_regex_pattern_add_optional_spaces_to_word_characters(u'prepared') + \ unicode(r'|') + _create_regex_pattern_add_optional_spaces_to_word_characters(u'created') + \ unicode(r').*?(AAS\s*?)?\sLATEX'), unicode(r'AAS\s+?LATEX\s+?') + _create_regex_pattern_add_optional_spaces_to_word_characters(u'macros') + u'v', unicode(r'^\s*?') + _create_regex_pattern_add_optional_spaces_to_word_characters(u'This paper has been produced using'), unicode(r'^\s*?') + \ _create_regex_pattern_add_optional_spaces_to_word_characters(u'This article was processed by the author using Springer-Verlag') + \ u' LATEX' ] for p in patterns: compiled_patterns.append(sre.compile(p, sre.I|sre.UNICODE)) return compiled_patterns def perform_regex_match_upon_line_with_pattern_list(line, patterns): """Given a list of COMPILED regex patters, perform the "sre.match" operation on the line for every pattern. Break from searching at the first match, returning the match object. In the case that no patterns match, the None type will be returned. @param line: (unicode string) to be searched in. @param patterns: (list) of compiled regex patterns to search "line" with. @return: (None or an sre.match object), depending upon whether one of the patterns matched within line or not. """ if type(patterns) not in (list, tuple): raise TypeError() if type(line) not in (str, unicode): raise TypeError() m = None for ptn in patterns: m = ptn.match(line) if m is not None: break return m def perform_regex_search_upon_line_with_pattern_list(line, patterns): """Given a list of COMPILED regex patters, perform the "sre.search" operation on the line for every pattern. Break from searching at the first match, returning the match object. In the case that no patterns match, the None type will be returned. @param line: (unicode string) to be searched in. @param patterns: (list) of compiled regex patterns to search "line" with. @return: (None or an sre.match object), depending upon whether one of the patterns matched within line or not. """ if type(patterns) not in (list, tuple): raise TypeError() if type(line) not in (str, unicode): raise TypeError() m = None for ptn in patterns: m = ptn.search(line) if m is not None: break return m def find_reference_section(docbody): """Search in document body for its reference section. More precisely, find the first line of the reference section. Effectively, the function starts at the end of a document and works backwards, line-by-line, looking for the title of a reference section. It stops when (if) it finds something that it considers to be the first line of a reference section. @param docbody: (list) of strings - the full document body. @return: (dictionary) : { 'start_line' : (integer) - index in docbody of first reference line, 'title_string' : (string) - title of the reference section. 'marker' : (string) - the marker of the first reference line, 'marker_pattern' : (string) - the regexp string used to find the marker, 'title_marker_same_line' : (integer) - a flag to indicate whether the reference section title was on the same line as the first reference line's marker or not. 1 if it was; 0 if it was not. } Much of this information is used by later functions to rebuild a reference section. -- OR -- (None) - when the reference section could not be found. 
""" ref_start_line = ref_title = ref_line_marker = ref_line_marker_ptn = None title_marker_same_line = found_part = None if len(docbody) > 0: title_patterns = get_reference_section_title_patterns() marker_patterns = get_reference_line_numeration_marker_patterns() p_num = sre.compile(unicode(r'(\d+)')) ## Try to find refs section title: x = len(docbody) - 1 found_title = 0 while x >= 0 and not found_title: title_match = perform_regex_search_upon_line_with_pattern_list(docbody[x], title_patterns) if title_match is not None: temp_ref_start_line = x temp_title = title_match.group('title') mk_with_title_ptns = get_reference_line_numeration_marker_patterns(temp_title) mk_with_title_match = perform_regex_search_upon_line_with_pattern_list(docbody[x], mk_with_title_ptns) if mk_with_title_match is not None: mk = mk_with_title_match.group('mark') mk_ptn = mk_with_title_match.re.pattern m_num = p_num.search(mk) if m_num is not None and int(m_num.group(0)) == 1: # Mark found. found_title = 1 ref_title = temp_title ref_line_marker = mk ref_line_marker_ptn = mk_ptn ref_start_line = temp_ref_start_line title_marker_same_line = 1 else: found_part = 1 ref_start_line = temp_ref_start_line ref_line_marker = mk ref_line_marker_ptn = mk_ptn ref_title = temp_title title_marker_same_line = 1 else: try: y = x + 1 ## Move past blank lines while docbody[y].isspace() and y < len(docbody): y = y + 1 ## Is this line numerated like a reference line? mark_match = perform_regex_match_upon_line_with_pattern_list(docbody[y], marker_patterns) if mark_match is not None: ## Ref line found. What is it? title_marker_same_line = None mark = mark_match.group('mark') mk_ptn = mark_match.re.pattern m_num = p_num.search(mark) if m_num is not None and int(m_num.group(0)) == 1: # 1st ref truly found ref_start_line = temp_ref_start_line ref_line_marker = mark ref_line_marker_ptn = mk_ptn ref_title = temp_title found_title = 1 elif m_num is not None and m_num.groups(0) != 1: found_part = 1 ref_start_line = temp_ref_start_line ref_line_marker = mark ref_line_marker_ptn = mk_ptn ref_title = temp_title else: if found_part: found_title = 1 else: found_part = 1 ref_start_line = temp_ref_start_line ref_title=temp_title ref_line_marker = mark ref_line_marker_ptn = mk_ptn else: ## No numeration if found_part: found_title = 1 else: found_part = 1 ref_start_line = temp_ref_start_line ref_title=temp_title except IndexError: ## References section title was on last line for some reason. Ignore pass x = x - 1 if ref_start_line is not None: ## return dictionary containing details of reference section: ref_sectn_details = { 'start_line' : ref_start_line, 'title_string' : ref_title, 'marker' : ref_line_marker, 'marker_pattern' : ref_line_marker_ptn, 'title_marker_same_line' : (title_marker_same_line is not None and 1) or (0) } else: ref_sectn_details = None return ref_sectn_details def find_reference_section_no_title(docbody): """This function would generally be used when it was not possible to locate the start of a document's reference section by means of its title. Instead, this function will look for reference lines that have numeric markers of the format [1], [2], etc. @param docbody: (list) of strings - each string is a line in the document. 
@return: (dictionary) : { 'start_line' : (integer) - index in docbody of first reference line, 'title_string' : (None) - title of the reference section None since no title, 'marker' : (string) - the marker of the first reference line, 'marker_pattern' : (string) - the regexp string used to find the marker, 'title_marker_same_line' : (integer) 0 - to signal title not on same line as marker. } Much of this information is used by later functions to rebuild a reference section. -- OR -- (None) - when the reference section could not be found. """ ref_start_line = ref_line_marker = None if len(docbody) > 0: marker_patterns = get_first_reference_line_numeration_marker_patterns() ## try to find first reference line in the reference section: x = len(docbody) - 1 found_ref_sect = 0 while x >= 0 and not found_ref_sect: mark_match = perform_regex_match_upon_line_with_pattern_list(docbody[x], marker_patterns) if mark_match is not None and int(mark_match.group('num')) == 1: ## Get marker recognition pattern: mk_ptn = mark_match.re.pattern ## Look for [2] in next 10 lines: next_test_lines = 10 y = x + 1 temp_found = 0 while y < len(docbody) and y < x + next_test_lines and not temp_found: mark_match2 = perform_regex_match_upon_line_with_pattern_list(docbody[y], marker_patterns) if (mark_match2 is not None) and (int(mark_match2.group('num')) == 2) and \ (mark_match.group('left') == mark_match2.group('left')) and (mark_match.group('right') == mark_match2.group('right')): ## Found next reference line: temp_found = 1 elif y == len(docbody) - 1: temp_found = 1 y = y + 1 if temp_found: found_ref_sect = 1 ref_start_line = x ref_line_marker = mark_match.group('mark') ref_line_marker_ptn = mk_ptn x = x - 1 if ref_start_line is not None: ref_sectn_details = { 'start_line' : ref_start_line, 'title_string' : None, 'marker' : ref_line_marker, 'marker_pattern' : ref_line_marker_ptn, 'title_marker_same_line' : 0 } else: ## didn't manage to find the reference section ref_sectn_details = None return ref_sectn_details def find_end_of_reference_section(docbody, ref_start_line, ref_line_marker, ref_line_marker_ptn): """Given that the start of a document's reference section has already been recognised, this function is tasked with finding the line-number in the document of the last line of the reference section. @param docbody: (list) of strings - the entire plain-text document body. @param ref_start_line: (integer) - the index in docbody of the first line of the reference section. @param ref_line_marker: (string) - the line marker of the first reference line. @param ref_line_marker_ptn: (string) - the pattern used to search for a reference line marker. @return: (integer) - index in docbody of the last reference line -- OR -- (None) - if ref_start_line was invalid. """ section_ended = 0 x = ref_start_line if (type(x) is not int) or (x < 0) or (x > len(docbody)) or (len(docbody)<1): ## The provided 'first line' of the reference section was invalid. Either it ## was out of bounds in the document body, or it was not a valid integer. ## Can't safely find end of refs with this info - quit! exit! 
        return None
    ## Get patterns for testing line:
    t_patterns = get_post_reference_section_title_patterns()
    kw_patterns = get_post_reference_section_keyword_patterns()
    if None not in (ref_line_marker, ref_line_marker_ptn):
        mk_patterns = [sre.compile(ref_line_marker_ptn, sre.I|sre.UNICODE)]
    else:
        mk_patterns = get_reference_line_numeration_marker_patterns()
    garbage_digit_pattern = sre.compile(unicode(r'^\s*?([\+\-]?\d+?(\.\d+)?\s*?)+?\s*?$'), sre.UNICODE)
    while (x < len(docbody)) and (not section_ended):
        ## look for a likely section title that would follow a reference section:
        end_match = perform_regex_search_upon_line_with_pattern_list(docbody[x], t_patterns)
        if end_match is None:
            ## didn't match a section title - try looking for keywords that suggest the end of a
            ## reference section:
            end_match = perform_regex_search_upon_line_with_pattern_list(docbody[x], kw_patterns)
        if end_match is not None:
            ## Is it really the end of the reference section? Check within the next
            ## 5 lines for other reference numeration markers:
            y = x + 1
            line_found = 0
            while (y < x + 6) and (y < len(docbody)) and (not line_found):
                num_match = perform_regex_search_upon_line_with_pattern_list(docbody[y], mk_patterns)
                if num_match is not None and not num_match.group(0).isdigit():
                    line_found = 1
                y = y + 1
            if not line_found:
                ## No reference line found - end the section here:
                section_ended = 1
        if not section_ended:
            ## Do this & the next 5 lines simply contain numbers? If yes, it's probably the axis
            ## scale of a graph in a figure. End the refs section:
            dm = garbage_digit_pattern.match(docbody[x])
            if dm is not None:
                y = x + 1
                digit_lines = 4
                num_digit_lines = 1
                while (y < x + digit_lines) and (y < len(docbody)):
                    dm = garbage_digit_pattern.match(docbody[y])
                    if dm is not None:
                        num_digit_lines += 1
                    y = y + 1
                if num_digit_lines == digit_lines:
                    section_ended = 1
        x = x + 1
    return x - 1


## ----> 3. Found reference section - now take out lines and rebuild them:

def test_for_blank_lines_separating_reference_lines(ref_sect):
    """Test to see if reference lines are separated by blank lines so that these can be used to
       rebuild reference lines.
       @param ref_sect: (list) of strings - the reference section.
       @return: (int) 0 if blank lines do not separate reference lines; 1 if they do.
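       For example (illustrative): a section in which the references (at least some of them wrapped
        over more than one line) are each followed by a single blank line would return 1. A
        double-spaced document, in which a blank line follows every single line of text, would
        return 0, since no run of adjacent non-blank lines would ever be seen and blank lines could
        not safely be treated as separators.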
""" num_blanks = 0 ## Number of blank lines found between non-blanks num_lines = 0 ## Number of reference lines separated by blanks blank_line_separators = 0 ## Flag to indicate whether blanks lines separate ref lines multi_nonblanks_found = 0 ## Flag to indicate whether multiple nonblank lines are found together (used because ## if line is dbl-spaced, it isnt a blank that separates refs & can't be relied upon) x = 0 max_line = len(ref_sect) while x < max_line: if not ref_sect[x].isspace(): ## not an empty line: num_lines += 1 x = x + 1 ## Move past line while x < len(ref_sect) and not ref_sect[x].isspace(): multi_nonblanks_found = 1 x = x + 1 x = x - 1 else: ## empty line num_blanks += 1 x = x + 1 while x< len(ref_sect) and ref_sect[x].isspace(): x = x + 1 if x == len(ref_sect): ## Blanks at end doc: dont count num_blanks -= 1 x = x - 1 x = x + 1 ## Now from the number of blank lines & the number of text lines, if num_lines > 3, & num_blanks = num_lines, ## or num_blanks = num_lines - 1, then we have blank line separators between reference lines if (num_lines > 3) and ((num_blanks == num_lines) or (num_blanks == num_lines - 1)) and (multi_nonblanks_found): blank_line_separators = 1 return blank_line_separators def remove_leading_garbage_lines_from_reference_section(ref_sectn): """Sometimes, the first lines of the extracted references are completely blank or email addresses. These must be removed as they are not references. @param ref_sectn: (list) of strings - the reference section lines @return: (list) of strings - the reference section without leading blank lines or email addresses. """ p_email = sre.compile(unicode(r'^\s*e\-?mail'), sre.UNICODE) while (len(ref_sectn) > 0) and (ref_sectn[0].isspace() or p_email.match(ref_sectn[0]) is not None): ref_sectn[0:1] = [] return ref_sectn def correct_rebuilt_lines(rebuilt_lines, p_refmarker): """Try to correct any cases where a reference line has been incorrectly split based upon a wrong numeration marker. That is to say, given the following situation: [1] Smith, J blah blah [2] Brown, N blah blah see reference [56] for more info [3] Wills, A blah blah ... The first part of the 3rd line clearly belongs with line 2. This function will try to fix this situation, to have the following situation: [1] Smith, J blah blah [2] Brown, N blah blah see reference [56] for more info [3] Wills, A blah blah If it cannot correctly guess the correct break-point in such a line, it will give up and the original list of reference lines will be returned. @param rebuilt_lines: (list) the rebuilt reference lines @param p_refmarker: (compiled regex pattern object) the pattern used to match regex line numeration markers. **MUST HAVE A GROUP 'marknum' to encapsulate the mark number!** (e.g. r'\[(?P<marknum>\d+)\]') @return: (list) of strings. If necessary, the corrected reference lines. Else the orginal 'rebuilt' lines. """ fixed = [] unsafe = 0 try: m = p_refmarker.match(rebuilt_lines[0]) last_marknum = int(m.group("marknum")) if last_marknum != 1: ## Even the first mark isnt 1 - probaby too dangerous to try to repair return rebuilt_lines except (IndexError, AttributeError, ValueError): ## Sometihng went wrong. Either no references, not a numbered line marker (int() failed), or ## no reference line marker (NoneType was passed). In any case, unable to test for correct ## reference line numberring - just return the lines as they were. return rebuilt_lines ## Loop through each line in "rebuilt_lines" and test the mark at the beginning. 
    ## If current-line-mark = previous-line-mark + 1, the line will be taken to be correct and appended
    ## to the list of fixed-lines. If not, then the loop will attempt to test whether the current line
    ## marker is actually part of the previous line by looking in the current line for another marker
    ## that has the numeric value of previous-marker + 1. If found, that marker will be taken as the true
    ## marker for the line and the leader of the line (up to the point of this marker) will be appended
    ## to the previous line. E.g.:
    ## [1] Smith, J blah blah
    ## [2] Brown, N blah blah see reference
    ## [56] for more info [3] Wills, A blah blah
    ## ...
    ##
    ## ...will be transformed into:
    ## [1] Smith, J blah blah
    ## [2] Brown, N blah blah see reference [56] for more info
    ## [3] Wills, A blah blah
    ## ...
    ## The first line is correct, so put it into fixed:
    fixed.append(rebuilt_lines[0])
-    try:
-        for x in xrange(1, len(rebuilt_lines)):
-            m = p_refmarker.match(rebuilt_lines[x])
-            try:
-                if int(m.group("marknum")) == last_marknum + 1:
-                    ## The marker number for this reference line is correct.
-                    ## Append it to the 'fixed' lines and move on.
-                    fixed.append(rebuilt_lines[x])
-                    last_marknum += 1
-                    continue
-                elif len(rebuilt_lines[x][m.end():].strip()) == 0:
-                    ## This line consists of a marker-number only - it is not a
-                    ## correct marker. Append it to the last line.
-                    fixed[len(fixed) - 1] += rebuilt_lines[x]
-                    continue
-                else:
-                    ## This marker != previous-marker + 1.
-                    ## May have taken some of the last line into this line. Can we find the
-                    ## next marker in this line?
-                    ## Test for this situation:
-                    ## [54] for more info [3] Wills, A blah blah
-                    m_fix = p_refmarker.search(rebuilt_lines[x][m.end():])
-
-                    if m_fix is not None and int(m_fix.group("marknum")) == last_marknum + 1:
-                        ## found next marker in line. Test to see that marker is followed by
-                        ## something meaningful i.e. a letter at least (name).
+    for x in xrange(1, len(rebuilt_lines)):
+        m = p_refmarker.match(rebuilt_lines[x])
+        try:
+            ## Get the number of this line:
+            curline_mark_num = m.group("marknum")
+        except AttributeError:
+            ## This line does not have a line marker at the start.
+            ## Add this line to the end of the previous line.
+            fixed[len(fixed) - 1] += rebuilt_lines[x]
+        else:
+            if int(curline_mark_num) == last_marknum + 1:
+                ## The marker number for this reference line is correct.
+                ## Append it to the 'fixed' lines and move on.
+                fixed.append(rebuilt_lines[x])
+                last_marknum += 1
+            elif len(rebuilt_lines[x][m.end():].strip()) == 0:
+                ## This line consists of a marker-number only - it is not a
+                ## correct marker. Append it to the last line.
+                fixed[len(fixed) - 1] += rebuilt_lines[x]
+            else:
+                ## This marker != previous-marker + 1.
+                ## May have taken some of the last line into this line.
+                ## Can we find the next marker in this line?
+                ## Test for this situation:
+                ## [54] for more info [3] Wills, A blah blah
+                current_line = rebuilt_lines[x]
+                m_next_mark = p_refmarker.search(current_line[m.end():])
+                while m_next_mark is not None:
+                    ## Another "line marker" is present in this line.
+                    ## Test it to see if it is equal to the previous
+                    ## 'real' marker + 1:
+                    if int(m_next_mark.group("marknum")) == \
+                       last_marknum + 1:
+                        ## This seems to be the marker for the next line.
+                        ## Test to see that the marker is followed by
+                        ## something meaningful (a letter at least).
                        ## I.e. we want to fix this:
                        ## [54] for more info [3] Wills, A blah blah
                        ##
                        ## but we don't want to fix this:
                        ## [54] for more info or even reference [3]
                        ##
                        ## as that would be unsafe.
                        m_test_nxt_mark_not_eol = \
-                            sre.search(sre.escape(m_fix.group()) + '\s*[A-Za-z]', rebuilt_lines[x])
+                            sre.search(sre.escape(m_next_mark.group()) \
+                                       + '\s*[A-Za-z]', current_line)
                        if m_test_nxt_mark_not_eol is not None:
                            ## move this section back to its real line:
                            ## get the segment of this line to be moved to the previous line
                            ## (append a newline to it too):
-                            movesect = rebuilt_lines[x][0:m_test_nxt_mark_not_eol.start()] + "\n"
+                            movesect = current_line[0:m_test_nxt_mark_not_eol.start()] + "\n"
                            ## Now get the previous line into a variable (without its newline at the end):
                            previous_line = fixed[len(fixed) - 1].rstrip("\n")
                            ## Now append the section to be moved to the previous line:
-                            previous_line += "%s%s" % ((previous_line[len(previous_line) - 1] != " " and " ") or (""), movesect)
+                            previous_line += movesect
                            fixed[len(fixed) - 1] = previous_line
                            ## Now append the remainder of the current line to the list of fixed
                            ## lines, and move on to the next line:
-                            fixed.append(rebuilt_lines[x][m_test_nxt_mark_not_eol.start():])
-
+                            fixed.append(current_line[m_test_nxt_mark_not_eol.start():])
+
                            last_marknum += 1
-                            continue
-                        else:
-                            ## The next marker in the line was not followed by text. It is unsafe to move it.
-                            ## Give up trying to correct these reference lines - it's not safe to continue.
-                            unsafe = 1
                            break
+                        else:
+                            ## The next 'marker' in this line was not followed by text.
+                            ## Take from the beginning of this line to the end of this
+                            ## marker, and append it to the end of the previous line:
+                            previous_line = fixed[len(fixed) - 1].rstrip("\n")
+                            movesect = current_line[0:m_next_mark.end()] + "\n"
+                            ## Now append the section to be moved to the previous line:
+                            previous_line += movesect
+                            fixed[len(fixed) - 1] = previous_line
+                            current_line = current_line[m_next_mark.end():]
+                    else:
-                        ## Unable to find another marker in the line that starts with the incorrect marker.
-                        ## It is therefore unsafe to attempt to correct the lines: just return the original lines.
-                        unsafe = 1
-                        break
-            except AttributeError:
-                ## This line does not have a line marker at the start! This line shall be added to the end of the previous line.
-                fixed[len(fixed) - 1] += rebuilt_lines[x]
-                continue
-    except IndexError:
-        ## Somewhere, the boundaries of the list of references were over-stepped. Just return the original set of reference lines.
-        unsafe = 1
-    if unsafe:
-        ## return the original set of references.
-        return rebuilt_lines
-    else:
-        ## return the newly corrected references.
-        return fixed
+                        ## This 'marker' is false - its value is not equal to
+                        ## the previous marker + 1:
+                        previous_line = fixed[len(fixed) - 1].rstrip("\n")
+                        movesect = current_line[0:m_next_mark.end()] + "\n"
+                        ## Now append the section to be moved to the previous line:
+                        previous_line += movesect
+                        fixed[len(fixed) - 1] = previous_line
+                        current_line = current_line[m_next_mark.end():]
+
+
+
+                    ## Get next match:
+                    m_next_mark = p_refmarker.search(current_line)
+
+                ## If there was still some of the "current line" left,
+                ## append it to the previous line:
+                if len(current_line.strip()) > 0:
+                    previous_line = fixed[len(fixed) - 1].rstrip("\n")
+                    movesect = current_line
+                    ## Now append the section to be moved to the previous line:
+                    previous_line += movesect
+                    fixed[len(fixed) - 1] = previous_line
+
+    return fixed
+

def wash_and_repair_reference_line(line):
    """Wash a reference line of undesirable characters (such as poorly-encoded letters, etc), and
       repair any errors (such as broken URLs) if possible.
       @param line: (string) the reference line to be washed/repaired.
       @return: (string) the washed reference line.
    """
    ## repair URLs in line:
    line = repair_broken_urls(line)
    ## Replace various undesirable characters with their alternatives:
    line = replace_undesirable_characters(line)
    ## remove instances of multiple spaces from line, replacing with a single space:
    line = sre_multiple_space.sub(u' ', line)
    return line

def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn):
    """Given a reference section, rebuild the reference lines. After translation from PDF to text,
       reference lines are often broken. This is because pdftotext doesn't know what is a wrapped
       line and what is a genuine new line. As a result, the following 2 reference lines:
       [1] See http://cdsware.cern.ch/ for more details.
       [2] Example, AN: private communication (1996).
       ...could be broken into the following 4 lines during translation from PDF to plaintext:
       [1] See http://cdsware.cern.ch/ fo
       r more details.
       [2] Example, AN: private communica
       tion (1996).
       Such a situation could lead to a citation being separated across 'lines', meaning that it
       wouldn't be correctly recognised. This function tries to rebuild the reference lines. It
       uses the pattern used to recognise a reference line's numeration marker to indicate the
       start of a line. If no reference line numeration was recognised, it will simply join all
       lines together into one large reference line.
       @param ref_sectn: (list) of strings. The (potentially broken) reference lines.
       @param ref_line_marker_ptn: (string) - the pattern used to recognise a reference line's
        numeration marker.
       @return: (list) of strings - the rebuilt reference section. Each string in the list
        represents a complete reference line.
    """
    ## initialise some vars:
    rebuilt_references = []
    working_line = u''
    len_ref_sectn = len(ref_sectn)
    if ref_line_marker_ptn is None or type(ref_line_marker_ptn) not in (str, unicode):
        if test_for_blank_lines_separating_reference_lines(ref_sectn):
            ## Use blank lines to separate ref lines:
            ref_line_marker_ptn = unicode(r'^\s*$')
        else:
            ## No ref line dividers: use an unmatchable pattern:
            ref_line_marker_ptn = unicode(r'^A$^A$$')
    p_ref_line_marker = sre.compile(ref_line_marker_ptn, sre.I|sre.UNICODE)
    for x in xrange(len_ref_sectn - 1, -1, -1):
        current_string = ref_sectn[x].strip()
        m_ref_line_marker = p_ref_line_marker.match(current_string)
        if m_ref_line_marker is not None:
            ## Ref line start marker
            if current_string == '':
                ## Blank line to separate refs. Append the current working line to the refs list:
                working_line = working_line.rstrip()
                working_line = wash_and_repair_reference_line(working_line)
                rebuilt_references.append(working_line)
                working_line = u''
            else:
                if current_string[len(current_string) - 1] in (u'-', u' '):
                    ## space or hyphenated word at the end of the line - don't add in a space
                    working_line = current_string + working_line
                else:
                    ## no space or hyphenated word at the end of this line - add in a space
                    working_line = current_string + u' ' + working_line
                working_line = working_line.rstrip()
                working_line = wash_and_repair_reference_line(working_line)
                rebuilt_references.append(working_line)
                working_line = u''
        else:
            if current_string != u'':
                ## Continuation of line
                if current_string[len(current_string) - 1] in (u'-', u' '):
                    ## space or hyphenated word at the end of the line - don't add in a space
                    working_line = current_string + working_line
                else:
                    ## no space or hyphenated word at the end of this line - add in a space
                    working_line = current_string + u' ' + working_line
    if working_line != u'':
        ## Append last line
        working_line = working_line.rstrip()
        working_line = wash_and_repair_reference_line(working_line)
        rebuilt_references.append(working_line)
    ## a list of reference lines has been built backwards - reverse it:
    rebuilt_references.reverse()
    rebuilt_references = correct_rebuilt_lines(rebuilt_references, p_ref_line_marker)
    return rebuilt_references

def get_reference_lines(docbody, ref_sect_start_line, ref_sect_end_line, \
                        ref_sect_title, ref_line_marker_ptn, title_marker_same_line):
    """After the reference section of a document has been identified, and the first and last lines
       of the reference section have been recorded, this function is called to take the reference
       lines out of the document body. The document's reference lines are returned in a list of
       strings whereby each string is a reference line. Before this can be done however, the
       reference section is passed to another function that rebuilds any broken reference lines.
       @param docbody: (list) of strings - the entire document body.
       @param ref_sect_start_line: (integer) - the index in docbody of the first reference line.
       @param ref_sect_end_line: (integer) - the index in docbody of the last reference line.
       @param ref_sect_title: (string) - the title of the reference section (e.g. "References").
       @param ref_line_marker_ptn: (string) - the pattern used to match the marker for each
        reference line (e.g., could be used to match lines with markers of the form [1], [2], etc.)
       @param title_marker_same_line: (integer) - a flag to indicate whether or not the reference
        section title was on the same line as the first reference line's marker.
       @return: (list) of strings. Each string is a reference line, extracted from the document.
    """
    start_idx = ref_sect_start_line
    if title_marker_same_line:
        ## Title on same line as 1st ref - take title out!
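        ## (Illustrative example: for a line u'REFERENCES: [1] Smith, J ...' with the title
        ##  u'REFERENCES', everything up to and including the title is cut away, leaving
        ##  u': [1] Smith, J ...' as the line.)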
        title_start = docbody[start_idx].find(ref_sect_title)
        if title_start != -1:
            docbody[start_idx] = docbody[start_idx][title_start + len(ref_sect_title):]
    elif ref_sect_title is not None:
        ## Pass title line
        start_idx += 1
    ## now rebuild reference lines:
    if type(ref_sect_end_line) is int:
        ref_lines = rebuild_reference_lines(docbody[start_idx:ref_sect_end_line+1], ref_line_marker_ptn)
    else:
        ref_lines = rebuild_reference_lines(docbody[start_idx:], ref_line_marker_ptn)
    return ref_lines


## ----> Glue - logic for finding and extracting reference section:

def extract_references_from_fulltext(fulltext):
    """Locate and extract the reference section from a fulltext document. Return the extracted
       reference section as a list of strings, whereby each string in the list is considered to be
       a single reference line. E.g. a string could be something like:
        '[19] Wilson, A. Unpublished (1986).'
       @param fulltext: (list) of strings, whereby each string is a line of the document.
       @return: (list) of strings, where each string is an extracted reference line.
    """
    ## Try to remove pagebreaks, headers, footers:
    fulltext = remove_page_boundary_lines(fulltext)
    status = 0
    ## Find start of refs section:
    ref_sect_start = find_reference_section(fulltext)
    if ref_sect_start is None:
        ## No references found - try with no title option
        ref_sect_start = find_reference_section_no_title(fulltext)
    if ref_sect_start is None:
        ## No References
        refs = []
        status = 4
        if cli_opts['verbosity'] >= 1:
            sys.stdout.write("-----extract_references_from_fulltext: ref_sect_start is None\n")
    else:
        ref_sect_end = find_end_of_reference_section(fulltext, ref_sect_start["start_line"], \
                                                     ref_sect_start["marker"], ref_sect_start["marker_pattern"])
        if ref_sect_end is None:
            ## No end to refs? Not safe to extract them:
            refs = []
            status = 5
            if cli_opts['verbosity'] >= 1:
                sys.stdout.write("-----extract_references_from_fulltext: no end to refs!\n")
        else:
            ## Extract the reference lines:
            refs = get_reference_lines(fulltext, ref_sect_start["start_line"], ref_sect_end, \
                                       ref_sect_start["title_string"], ref_sect_start["marker_pattern"], \
                                       ref_sect_start["title_marker_same_line"])
    return (refs, status)


## Tasks related to conversion of full-text to plain-text:

def _pdftotext_conversion_is_bad(txtlines):
    """Sometimes pdftotext performs a bad conversion which consists of many spaces and garbage
       characters. This method takes a list of strings obtained from a pdftotext conversion and
       examines them to see if they are likely to be the result of a bad conversion.
       @param txtlines: (list) of unicode strings obtained from pdftotext conversion.
       @return: (integer) - 1 if bad conversion; 0 if good conversion.
    """
    ## Numbers of 'words' and 'whitespaces' found in document:
    numWords = numSpaces = 0
    ## whitespace character pattern:
    p_space = sre.compile(unicode(r'(\s)'), sre.UNICODE)
    ## non-whitespace 'word' pattern:
    p_noSpace = sre.compile(unicode(r'(\S+)'), sre.UNICODE)
    for txtline in txtlines:
        numWords = numWords + len(p_noSpace.findall(txtline))
        numSpaces = numSpaces + len(p_space.findall(txtline))
    if numSpaces >= (numWords * 3):
        ## Too many spaces - probably bad conversion
        return 1
    else:
        return 0

def convert_PDF_to_plaintext(fpath):
    """Take the path to a PDF file and run pdftotext for this file, capturing the output.
       @param fpath: (string) path to the PDF file
       @return: (list) of unicode strings (contents of the PDF file translated into plaintext;
        each string is a line in the document.)
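       The conversion is performed by piping the output of a command of the form:
        pdftotext -raw -q -enc UTF-8 '/path/to/file.pdf' -
       (the path shown here is purely illustrative.)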
""" status = 0 doclines = [] ## build pdftotext command: - cmd_pdftotext = """%(pdftotext)s -raw -q -enc UTF-8 %(filepath)s -""" % { 'pdftotext' : CFG_PATH_PDFTOTEXT, - 'filepath' : fpath } + cmd_pdftotext = """%(pdftotext)s -raw -q -enc UTF-8 '%(filepath)s' -""" \ + % { 'pdftotext' : CFG_PATH_PDFTOTEXT, + 'filepath' : fpath.replace("'", "\\'") + } if cli_opts['verbosity'] >= 1: sys.stdout.write("%s\n" % cmd_pdftotext) ## open pipe to pdftotext: pipe_pdftotext = os.popen("%s" % cmd_pdftotext, 'r') ## read back results: count = 0 for docline in pipe_pdftotext: doclines.append(docline.decode("utf-8")) count += 1 ## close pipe to pdftotext: pipe_pdftotext.close() if cli_opts['verbosity'] >= 1: sys.stdout.write("-----convert_PDF_to_plaintext found: %s lines of text\n" \ % str(count)) ## finally, check conversion result not bad: if _pdftotext_conversion_is_bad(doclines): status = 2 doclines = [] return (doclines, status) -def convert_document_to_plaintext(fpath): - """Given the path to a file, convert it to plaintext and return the content as a list, whereby - each line of text in the document file is a string in the list. - @param fpath: (string) the path to the file to be converted to text - @return: list of strings (the plaintext body of the file) - """ - doc_plaintext = [] - pipe_gfile = os.popen("%s %s" % (CFG_PATH_GFILE, fpath), "r") - res_gfile = pipe_gfile.readline() - pipe_gfile.close() - status = 0 - if res_gfile.lower().find("pdf") != -1: - ## convert from PDF - (doc_plaintext, status) = convert_PDF_to_plaintext(fpath) - return (doc_plaintext, status) - def get_plaintext_document_body(fpath): """Given a file-path to a full-text, return a list of unicode strings whereby each string is a line of the fulltext. In the case of a plain-text document, this simply means reading the contents in from the file. In the case of a PDF/PostScript however, this means converting the document to plaintext. @param: fpath: (string) - the path to the fulltext file @return: (list) of strings - each string being a line in the document. """ textbody = [] status = 0 if os.access(fpath, os.F_OK|os.R_OK): # filepath OK - attempt to extract references: ## get file type: - pipe_gfile = os.popen("%s %s" % (CFG_PATH_GFILE, fpath), "r") + pipe_gfile = os.popen("%s '%s'" \ + % (CFG_PATH_GFILE, fpath.replace("'", "\\'")), "r") res_gfile = pipe_gfile.readline() pipe_gfile.close() if res_gfile.lower().find("text") != -1 and \ - res_gfile.lower().find("postscript") == -1 and \ res_gfile.lower().find("pdf") == -1: ## plain-text file: don't convert - just read in: #textbody = open("%s" % fpath, "r").readlines() textbody = [] for line in open("%s" % fpath, "r").readlines(): textbody.append(line.decode("utf-8")) - else: - ## assume file needs to be converted to text: - (textbody, status) = convert_document_to_plaintext(fpath) + elif res_gfile.lower().find("pdf") != -1: + ## convert from PDF + (textbody, status) = convert_PDF_to_plaintext(fpath) else: ## filepath not OK status = 1 ## raise IOError("Could not find file %s" % fpath) return (textbody, status) def write_raw_references_to_stream(recid, raw_refs, strm=None): """Write a lost of raw reference lines to the a given stream. Each reference line is preceeded by the record-id. Thus, if for example, the following 2 reference lines were passed to this function: [1] See http://cdsware.cern.ch/ for more details. [2] Example, AN: private communication (1996). 
and the record-id was "1", the raw reference lines printed to the stream would be: 1:[1] See http://cdsware.cern.ch/ for more details. 1:[2] Example, AN: private communication (1996). @param recid: (string) the record-id of the document for which raw references are to be written-out. @param raw_refs: (list) of strings. The raw references to be written-out. @param strm: (open stream object) - the stream object to which the references are to be written. If the stream object is not a valid open stream (or is None, by default), the standard error stream (sys.stderr) will be used by default. @return: None. """ if strm is None or type(strm) is not file: ## invalid stream supplied - write to sys.stderr strm = sys.stderr elif strm.closed: ## The stream was closed - use stderr: strm = sys.stderr ## write the reference lines to the stream: strm.writelines(map(lambda x: "%(recid)s:%(refline)s\n" \ % { 'recid' : recid, 'refline' : x.encode("utf-8") }, raw_refs)) strm.flush() def usage(wmsg="", err_code=0): """Display a usage message for refextract on the standard error stream and then exit. @param wmsg: (string) - some kind of warning message for the user. @param err_code: (integer) - an error code to be passed to sys.exit, which is called after the usage message has been printed. @return: None. """ if wmsg != "": wmsg = wmsg.strip() + "\n" msg = """Usage: refextract [options] recid:file1 [recid:file2 ...] refextract tries to extract the reference section from a full-text document. Extracted reference lines are processed and any recognised citations are marked up using MARC XML. Results are output to the standard output stream. Options: -h, --help print this help -V, --version print version information -v, --verbose verbosity level (0=mute, 1=default info msg, 2=display reference section extraction analysis, 3=display reference line citation processing analysis, 9=max information) -r, --output-raw-refs output raw references, as extracted from the document. No MARC XML mark-up - just each extracted line, prefixed by the recid of the document that it came from. -x, --xmlfile write xml output to a file rather than standard out -z, --raw-references treat the input file as pure references. i.e. skip the stage of trying to locate the reference section within a document and instead move to the stage of recognition and standardisation of citations within lines. Example: refextract 499:thesis.pdf """ sys.stderr.write(wmsg + msg) sys.exit(err_code) def get_cli_options(): """Get the various arguments and options from the command line and populate a dictionary of cli_options. @return: (tuple) of 2 elements. First element is a dictionary of cli options and flags, set as appropriate; Second element is a list of cli arguments. """ global cli_opts ## dictionary of important flags and values relating to cli call of program: cli_opts = { 'treat_as_reference_section' : 0, 'output_raw' : 0, 'verbosity' : 0, 'xmlfile' : 0, } try: myoptions, myargs = getopt.getopt(sys.argv[1:], "hVv:zrx:", \ ["help", "version", "verbose=", "raw-references", "output-raw-refs", "xmlfile="]) except getopt.GetoptError, err: ## Invalid option provided - usage message usage(wmsg="Error: %(msg)s." 
                                 % { 'msg' : str(err) })
    for o in myoptions:
        if o[0] in ("-V","--version"):
            ## version message and exit
            sys.stdout.write("%s\n" % __revision__)
            sys.stdout.flush()
            sys.exit(0)
        elif o[0] in ("-h","--help"):
            ## help message and exit
            usage()
        elif o[0] in ("-r", "--output-raw-refs"):
            cli_opts['output_raw'] = 1
        elif o[0] in ("-v", "--verbose"):
            if not o[1].isdigit():
                cli_opts['verbosity'] = 0
            elif int(o[1]) not in xrange(0, 10):
                cli_opts['verbosity'] = 0
            else:
                cli_opts['verbosity'] = int(o[1])
        elif o[0] in ("-z", "--raw-references"):
            ## treat input as pure reference lines:
            cli_opts['treat_as_reference_section'] = 1
        elif o[0] in ("-x", "--xmlfile"):
            cli_opts['xmlfile'] = o[1]
    if len(myargs) == 0:
        ## no arguments: error message
        usage(wmsg="Error: no full-text.")
    return (cli_opts, myargs)

def display_xml_record(status_code, count_reportnum, count_title, count_url, count_misc, recid, xml_lines):
    """Given a series of MARC XML-ized reference lines and a record-id, build a MARC XML record
       containing them and return it as a string. Include in the record some stats for the
       extraction job.
       The MARC XML record will essentially take the following structure:
        <record>
           <controlfield tag="001">1</controlfield>
           <datafield tag="999" ind1="C" ind2="5">
              ...
           </datafield>
           [...]
           <datafield tag="999" ind1="C" ind2="6">
              <subfield code="a">
        CDS Invenio/X.XX.X refextract/X.XX.X-timestamp-error-reportnum-title-URL-misc
              </subfield>
           </datafield>
        </record>
       Timestamp, error(code), reportnum, title, URL, and misc will of course take the relevant
       values.
       @param status_code: (integer) the status of reference-extraction for the given record: was
        there an error or not? 0 = no error; 1 = error.
       @param count_reportnum: (integer) - the number of institutional report-number citations
        found in the document's reference lines.
       @param count_title: (integer) - the number of journal title citations found in the
        document's reference lines.
       @param count_url: (integer) - the number of URL citations found in the document's reference
        lines.
       @param count_misc: (integer) - the number of sections of miscellaneous text (i.e. 999C5$m)
        from the document's reference lines.
       @param recid: (string) - the record-id of the given document. (put into 001 field.)
       @param xml_lines: (list) of strings. Each string in the list contains a group of MARC XML
        999C5 datafields, making up a single reference line. These reference lines will make up
        the document body.
@return: (string) - the MARC XML representation of the record.
    """
    ## Start with the opening record tag:
    out = u"%(record-open)s\n" % { 'record-open' : CFG_REFEXTRACT_XML_RECORD_OPEN, }
    ## Display the record-id controlfield:
    out += u"""   <controlfield tag="%(cf-tag-recid)s">%(recid)s</controlfield>\n""" \
           % { 'cf-tag-recid' : CFG_REFEXTRACT_CTRL_FIELD_RECID,
               'recid' : encode_for_xml(recid),
             }
    ## Loop through all xml lines and add them to the output string:
    for line in xml_lines:
        out += line
    ## add the 999C6 status subfields:
    out += u"""   <datafield tag="%(df-tag-ref-stats)s" ind1="%(df-ind1-ref-stats)s" ind2="%(df-ind2-ref-stats)s">
      <subfield code="%(sf-code-ref-stats)s">%(version)s-%(timestamp)s-%(status)s-%(reportnum)s-%(title)s-%(url)s-%(misc)s</subfield>
   </datafield>\n""" \
        % { 'df-tag-ref-stats'  : CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS,
            'df-ind1-ref-stats' : CFG_REFEXTRACT_IND1_EXTRACTION_STATS,
            'df-ind2-ref-stats' : CFG_REFEXTRACT_IND2_EXTRACTION_STATS,
            'sf-code-ref-stats' : CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS,
            'version'   : CFG_REFEXTRACT_VERSION,
            'timestamp' : str(int(mktime(localtime()))),
            'status'    : status_code,
            'reportnum' : count_reportnum,
            'title'     : count_title,
            'url'       : count_url,
            'misc'      : count_misc,
          }
    ## Now add the closing tag to the record:
    out += u"%(record-close)s\n" % { 'record-close' : CFG_REFEXTRACT_XML_RECORD_CLOSE, }
    return out

def main():
    """Main function.
    """
    global cli_opts
    (cli_opts, cli_args) = get_cli_options()
    extract_jobs = get_recids_and_filepaths(cli_args)
    if len(extract_jobs) == 0:
        ## no files provided for reference extraction - error message
        usage()
    ## Read the journal titles knowledge base, creating the search patterns and replace terms:
    (title_search_kb, title_search_standardised_titles, title_search_keys) = \
          build_titles_knowledge_base(CFG_REFEXTRACT_KB_JOURNAL_TITLES)
    (preprint_reportnum_sre, standardised_preprint_reportnum_categs) = \
          build_institutes_preprints_numeration_knowledge_base(CFG_REFEXTRACT_KB_REPORT_NUMBERS)
    done_coltags = 0  ## flag to signal that the XML collection tags have been output
    for curitem in extract_jobs:
        extract_error = 0  ## extraction was OK unless determined otherwise
        ## reset the stats counters:
        count_misc = count_title = count_reportnum = count_url = 0
        recid = curitem[0]
        if cli_opts['verbosity'] >= 1:
            sys.stdout.write("--- processing RecID: %s pdffile: %s\n" \
                             % (str(curitem[0]), curitem[1]))
        if not done_coltags:
            ## Output opening XML collection tags:
            if cli_opts['xmlfile']:
                try:
                    ofilehdl = open(cli_opts['xmlfile'], 'w')
                    ofilehdl.write("%s\n" % CFG_REFEXTRACT_XML_VERSION.encode("utf-8"))
                    ofilehdl.write("%s\n" % CFG_REFEXTRACT_XML_COLLECTION_OPEN.encode("utf-8"))
                    ofilehdl.flush()
                except:
                    sys.stdout.write("***%s\n\n" % cli_opts['xmlfile'])
                    raise IOError("Cannot open %s to write!" % cli_opts['xmlfile'])
            else:
                sys.stdout.write("%s\n" % (CFG_REFEXTRACT_XML_VERSION.encode("utf-8"),))
                sys.stdout.write("%s\n" % (CFG_REFEXTRACT_XML_COLLECTION_OPEN.encode("utf-8"),))
            done_coltags = 1
        ## 1. Get this document body as plaintext:
        (docbody, extract_error) = get_plaintext_document_body(curitem[1])
        if extract_error == 0 and len(docbody) == 0:
            extract_error = 3
        if cli_opts['verbosity'] >= 1:
            sys.stdout.write("-----get_plaintext_document_body gave: %s lines," \
                             " overall error: %s\n" % (str(len(docbody)), str(extract_error)))
        if len(docbody) > 0:
            ## the document body is not empty:
            ## 2.
If necessary, locate the reference section: if cli_opts['treat_as_reference_section']: ## don't search for citations in the document body: treat it as a reference section: reflines = docbody else: ## launch search for the reference section in the document body: (reflines, extract_error) = extract_references_from_fulltext(docbody) if len(reflines) == 0 and extract_error == 0: extract_error = 6 if cli_opts['verbosity'] >= 1: sys.stdout.write("-----extract_references_from_fulltext gave " \ "len(reflines): %s overall error: %s\n" \ % (str(len(reflines)), str(extract_error))) ## 3. Standardise the reference lines: # reflines = test_get_reference_lines() (processed_references, count_misc, \ count_title, count_reportnum, count_url) = \ create_marc_xml_reference_section(reflines, preprint_repnum_search_kb=preprint_reportnum_sre, preprint_repnum_standardised_categs=\ standardised_preprint_reportnum_categs, periodical_title_search_kb=title_search_kb, standardised_periodical_titles=title_search_standardised_titles, periodical_title_search_keys=title_search_keys) else: ## document body is empty, therefore the reference section is empty: reflines = [] processed_references = [] ## 4. Display the extracted references, status codes, etc: if cli_opts['output_raw']: ## now write the raw references to the stream: raw_file = str(recid) + '.rawrefs' try: rawfilehdl = open(raw_file, 'w') write_raw_references_to_stream(recid, reflines, rawfilehdl) rawfilehdl.close() except: raise IOError("Cannot open raw ref file: %s to write" % raw_file) ## Display the processed reference lines: out = display_xml_record(extract_error, count_reportnum, count_title, count_url, count_misc, recid, processed_references) if cli_opts['verbosity'] >= 1: lines = out.split('\n') sys.stdout.write("-----display_xml_record gave: %s significant lines " \ "of xml, overall error: %s\n" % (str(len(lines) - 7), extract_error)) if cli_opts['xmlfile']: ofilehdl.write("%s" % (out.encode("utf-8"),)) ofilehdl.flush() else: ## Write the record to the standard output stream: sys.stdout.write("%s" % out.encode("utf-8")) sys.stdout.flush() ## If an XML collection was opened, display closing tag if done_coltags: if (cli_opts['xmlfile']): ofilehdl.write("%s\n" % CFG_REFEXTRACT_XML_COLLECTION_CLOSE.encode("utf-8")) ofilehdl.close() else: sys.stdout.write("%s\n" % CFG_REFEXTRACT_XML_COLLECTION_CLOSE.encode("utf-8")) def test_get_reference_lines(): """Returns some test reference lines. @return: (list) of strings - the test reference lines. Each string in the list is a reference line that should be processed. """ reflines = ["""[1] J. Maldacena, Adv. Theor. Math. Phys. 2 (1998) 231; hep-th/9711200. http://cdsweb.cern.ch/""", """[2] S. Gubser, I. Klebanov and A. Polyakov, Phys. Lett. B428 (1998) 105; hep-th/9802109. http://cdsweb.cern.ch/search.py?AGE=hello-world&ln=en""", """[3] E. Witten, Adv. Theor. Math. Phys. 2 (1998) 253; hep-th/9802150.""", """[4] O. Aharony, S. Gubser, J. Maldacena, H. Ooguri and Y. Oz, hep-th/9905111.""", """[5] L. Susskind, J. Math. Phys. 36 (1995) 6377; hep-th/9409089.""", """[6] L. Susskind and E. Witten, hep-th/9805114.""", """[7] W. Fischler and L. Susskind, hep-th/9806039; N. Kaloper and A. Linde, Phys. Rev. D60 (1999) 105509, hep-th/9904120.""", """[8] R. Bousso, JHEP 9906:028 (1999); hep-th/9906022.""", """[9] R. Penrose and W. Rindler, Spinors and Spacetime, volume 2, chapter 9 (Cambridge University Press, Cambridge, 1986).""", """[10] R. Britto-Pacumio, A. Strominger and A. Volovich, JHEP 9911:013 (1999); hep-th/9905211. 
blah hep-th/9905211 blah hep-ph/9711200""", """[11] V. Balasubramanian and P. Kraus, Commun. Math. Phys. 208 (1999) 413; hep-th/9902121.""", """[12] V. Balasubramanian and P. Kraus, Phys. Rev. Lett. 83 (1999) 3605; hep-th/9903190.""", """[13] P. Kraus, F. Larsen and R. Siebelink, hep-th/9906127.""", """[14] L. Randall and R. Sundrum, Phys. Rev. Lett. 83 (1999) 4690; hep-th/9906064. this is a test RN of a different type: CERN-LHC-Project-Report-2006-003. more text.""", """[15] S. Gubser, hep-th/9912001.""", """[16] H. Verlinde, hep-th/9906182; H. Verlinde, hep-th/9912018; J. de Boer, E. Verlinde and H. Verlinde, hep-th/9912012.""", """[17] E. Witten, remarks at ITP Santa Barbara conference, "New dimensions in field theory and string theory": http://www.itp.ucsb.edu/online/susyc99/discussion/.""", """[18] D. Page and C. Pope, Commun. Math. Phys. 127 (1990) 529.""", """[19] M. Duff, B. Nilsson and C. Pope, Physics Reports 130 (1986), chapter 9.""", """[20] D. Page, Phys. Lett. B79 (1978) 235.""", """[21] M. Cassidy and S. Hawking, Phys. Rev. D57 (1998) 2372, hep-th/9709066; S. Hawking, Phys. Rev. D52 (1995) 5681.""", """[22] K. Skenderis and S. Solodukhin, hep-th/9910023.""", """[23] M. Henningson and K. Skenderis, JHEP 9807:023 (1998), hep-th/9806087.""", """[24] C. Fefferman and C. Graham, "Conformal Invariants", in Elie Cartan et les Mathematiques d'aujourd'hui (Asterisque, 1985) 95.""", """[25] C. Graham and J. Lee, Adv. Math. 87 (1991) 186. <a href="http://cdsweb.cern.ch/">CERN Document Server</a>""", """[26] E. Witten and S.-T. Yau, hep-th/9910245.""", """[27] R. Emparan, JHEP 9906:036 (1999); hep-th/9906040.""", """[28] A. Chamblin, R. Emparan, C. Johnson and R. Myers, Phys. Rev. D59 (1999) 64010, hep-th/9808177; S. Hawking, C. Hunter and D. Page, Phys. Rev. D59 (1999) 44033, hep-th/9809035.""", """[29] S. Sethi and L. Susskind, Phys. Lett. B400 (1997) 265, hep-th/9702101; T. Banks and N. Seiberg, Nucl. Phys. B497 (1997) 41, hep-th/9702187.""", """[30] R. Emparan, C. Johnson and R. Myers, Phys. Rev. D60 (1999) 104001; hep-th/9903238.""", """[31] S. Hawking, C. Hunter and M. Taylor-Robinson, Phys. Rev. D59 (1999) 064005; hep-th/9811056.""", """[32] J. Dowker, Class. Quant. Grav. 16 (1999) 1937; hep-th/9812202.""", """[33] J. Brown and J. York, Phys. Rev. D47 (1993) 1407.""", """[34] D. Freedman, S. Mathur, A. Matsuis and L. Rastelli, Nucl. Phys. B546 (1999) 96; hep-th/9804058. More text, followed by an IBID A 546 (1999) 96""", """[35] D. Freedman, S. Mathur, A. Matsuis and L. Rastelli, Nucl. Phys. B546 (1999) 96; hep-th/9804058. More text, followed by an IBID A""", """[36] whatever http://cdsware.cern.ch/""", """[37] some misc lkjslkdjlksjflksj [hep-th/9804058] lkjlkjlkjlkj [hep-th/0001567], hep-th/1212321, some more misc, Nucl. Phys. B546 (1999) 96""", """[38] R. Emparan, C. Johnson and R.... Myers, Phys. Rev. D60 (1999) 104001; this is :: .... misc! hep-th/9903238. and some ...,.,.,.,::: more hep-ph/9912000""", ] return reflines
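

## A minimal sketch (not executed anywhere in this module) of how the components above fit
## together for a single document; 'example.pdf' is a purely hypothetical path:
##
##     (docbody, status) = get_plaintext_document_body("example.pdf")
##     if status == 0:
##         (reflines, status) = extract_references_from_fulltext(docbody)
##         for ref in reflines:
##             print ref.encode("utf-8")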