Page Menu — Home — c4science

No OneTemporary

File Metadata

Created
Mon, Oct 28, 15:04
This file is larger than 256 KB, so syntax highlighting was skipped.
diff --git a/.gitignore b/.gitignore
index a92ad6b2e..bd4ce6bb9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,97 +1,98 @@
.version
Makefile
Makefile.in
configure
config.cache
config.log
config.status
config.nice
config.guess
config.sub
install-sh
missing
compile
autom4te.cache
aclocal.m4
TAGS
invenio-autotools.conf
po/POTFILES
po/POTFILES-py
po/POTFILES-webdoc
po/stamp-po
po/*.gmo
po/*.mo
po/*.sed
*~
*.pyc
*.clisp.mem
*.cmucl.core
*.sbcl.core
*.fas
*.fasl
*.sse2f
*.lib
*.x86f
modules/webauthorprofile/bin/webauthorprofile
modules/bibauthorid/bin/bibauthorid
modules/bibcirculation/bin/bibcircd
modules/bibclassify/bin/bibclassify
modules/bibconvert/bin/bibconvert
modules/bibdocfile/bin/bibdocfile
modules/bibedit/bin/bibedit
modules/bibrecord/bin/textmarc2xmlmarc
modules/bibrecord/bin/xmlmarc2textmarc
modules/bibrecord/bin/xmlmarclint
modules/docextract/bin/refextract
modules/docextract/bin/docextract
+modules/docextract/bin/convert_journals
modules/bibencode/bin/bibencode
modules/bibexport/bin/bibexport
modules/bibformat/bin/bibreformat
modules/oaiharvest/bin/oaiharvest
modules/oairepository/bin/oairepositoryupdater
modules/bibindex/bin/bibindex
modules/bibindex/bin/bibstat
modules/bibmatch/bin/bibmatch
modules/bibrank/bin/bibrank
modules/bibrank/bin/bibrankgkb
modules/bibrank/etc/bibrankgkb.cfg
modules/bibrank/etc/demo_jif.cfg
modules/bibrank/etc/template_single_tag_rank_method.cfg
modules/bibsched/bin/bibsched
modules/bibsched/bin/bibtaskex
modules/bibsched/bin/bibtasklet
modules/bibsort/bin/bibsort
modules/bibsword/bin/bibsword
modules/bibupload/bin/batchuploader
modules/bibupload/bin/bibupload
modules/elmsubmit/bin/elmsubmit
modules/elmsubmit/etc/elmsubmit.cfg
modules/miscutil/bin/dbdump
modules/miscutil/bin/dbexec
modules/miscutil/bin/inveniocfg
modules/miscutil/bin/plotextractor
modules/miscutil/etc/bash_completion.d/inveniocfg
modules/miscutil/lib/build
modules/webaccess/bin/authaction
modules/webaccess/bin/webaccessadmin
modules/webalert/bin/alertengine
modules/webmessage/bin/webmessageadmin
modules/websearch/bin/webcoll
modules/websession/bin/inveniogc
modules/webstat/bin/webstat
modules/webstat/bin/webstatadmin
modules/webstyle/bin/gotoadmin
modules/webstyle/bin/webdoc
modules/websubmit/bin/bibdocfile
modules/websubmit/bin/inveniounoconv
modules/websubmit/bin/websubmitadmin
modules/bibcirculation/bin/bibcircd
tags
config.status.lineno
configure.lineno
*.kdevelop
*.kdevses
.project
.settings
.pydevproject
org.eclipse.core.resources.prefs
diff --git a/modules/docextract/etc/collaborations.kb b/modules/docextract/etc/collaborations.kb
index df0916f15..f84d225a5 100644
--- a/modules/docextract/etc/collaborations.kb
+++ b/modules/docextract/etc/collaborations.kb
@@ -1,31 +1,32 @@
## This file holds text which must be recognised alongside authors, and hence included in the $h subfields.
## Matches using this data do not affect how references are split.
## (Just simply appends to the most recent $h subfield for the datafield, or makes a new one).
## Do not append an 's' to the end.
## Insert only the Upper cased version.
CMS Collaboration---CMS Collaboration
ATLAS Collaboration---ATLAS Collaboration
ALICE Collaboration---ALICE Collaboration
LEP Collaboration---LEP Collaboration
CDF Collaboration---CDF Collaboration
D0 Collaboration---D0 Collaboration
ALEPH Collaboration---ALEPH Collaboration
DELPHI Collaboration---DELPHI Collaboration
L3 Collaboration---L3 Collaboration
OPAL Collaboration---OPAL Collaboration
CTEQ Collaboration---CTEQ Collaboration
GEANT4 Collaboration---GEANT4 Collaboration
LHC-B Collaboration---LHC-B Collaboration
CDF II Collaboration---CDF II Collaboration
RD 48 Collaboration---RD 48 Collaboration
SLD Collaboration---SLD Collaboration
H1 Collaboration---H1 Collaboration
COMPASS Collaboration---COMPASS Collaboration
HERMES Collaboration---HERMES Collaboration
European Muon Collaboration---European Muon Collaboration
Spin Muon Collaboration---Spin Muon Collaboration
E143 Collaboration---E143 Collaboration
Particle Data Group Collaboration---Particle Data Group Collaboration
ATLAS Inner Detector software group Collaboration---ATLAS Inner Detector software group Collaboration
DØ Collaboration---DØ Collaboration
CUORE Collaboration---CUORE Collaboration
+Belle Collaboration---Belle Collaboration
diff --git a/modules/docextract/etc/report-numbers.kb b/modules/docextract/etc/report-numbers.kb
index 5163cd997..5ef693af9 100644
--- a/modules/docextract/etc/report-numbers.kb
+++ b/modules/docextract/etc/report-numbers.kb
@@ -1,236 +1,250 @@
*****LANL*****
<s/syymm999>
<syymm999>
ACC PHYS ---acc-phys
ADAP ORG ---adap-org
ALG GEOM ---alg-geom
AO SCI ---ao-sci
AUTO FMS ---auto-fms
BAYES AN ---bayes-an
CD HG ---cd-hg
CMP LG ---cmp-lg
COMP GAS ---comp-gas
DG GA ---dg-ga
FUNCT AN ---funct-an
GR QC ---gr-qc
ARXIVHEP EX ---hep-ex
ARXIVHEP PH ---hep-ph
ARXIVHEP TH ---hep-th
LC OM ---lc-om
MTRL TH ---mtrl-th
NEURO CEL ---neuro-cel
NEURO DEV ---neuro-dev
NEURO SCI ---neuro-sci
PATT SOL ---patt-sol
*****FermiLab*****
< 9999>
< 999>
< yy 999 [AET ]>
< yyyy 999 [AET ]>
< yyyy 99>
FERMILAB CONF ---FERMILAB-Conf
FERMILAB FN ---FERMILAB-FN
FERMILAB PUB ---FERMILAB-Pub
FERMILAB TM ---FERMILAB-TM
FERMILAB DESIGN ---FERMILAB-Design
FERMILAB THESIS ---FERMILAB-Thesis
FERMILAB MASTERS---FERMILAB-Masters
*****Fermilab DØ notes*****
< 9999>
DØ NOTE---DØ-Note
*****CERN*****
< yy 999>
<syyyy 999>
ALEPH ---ALEPH
ALICE ---ALICE
ALICE INT ---ALICE-INT
ALICE NOTE ---ALICE-INT
ATL CAL ---ATL-CAL
ATL COM ---ATL-COM
ATL COM SOFT ---ATL-COM-SOFT
ATL COM PUB ---ATL-COM-DAQ
ATL COM DAQ ---ATL-COM-DAQ
ATL COM MUON ---ATL-COM-MUON
ATL COM PHYS ---ATL-COM-PHYS
TL COM PHYS ---ATL-COM-PHYS
ATL COM TILECAL ---ATL-COM-TILECAL
ATL COM LARG ---ATL-COM-LARG
ATL DAQ ---ATL-DAQ
ATL DAQ CONF ---ATL-DAQ-CONF
ATL GEN ---ATL-GEN
ATL INDET ---ATL-INDET
ATL LARG ---ATL-LARG
ATL MUON ---ATL-MUON
ATL PUB MUON ---ATL-PUB-MUON
ATL PHYS ---ATL-PHYS
ATL PHYS PUB ---ATL-PHYS-PUB
ATL PHYSPUB ---ATL-PHYS-PUB
ATLPHYS PUB ---ATL-PHYS-PUB
ATL PHYS INT ---ATL-PHYS-INT
ATL PHYSINT ---ATL-PHYS-INT
ATLPHYS INT ---ATL-PHYS-INT
ATL TECH ---ATL-TECH
ATL TILECAL ---ATL-TILECAL
ATL SOFT ---ATL-SOFT
ATL SOFT PUB ---ATL-SOFT-PUB
ATL IS EN ---ATL-IS-EN
ATL IS QA ---ATL-IS-QA
ATL LARG PUB ---ATL-LARG-PUB
ATL COM LARG ---ATL-COM-LARG
TL COM LARG ---ATL-COM-LARG
ATLCOM LARG ---ATL-COM-LARG
ATL MAGNET PUB ---ATL-MAGNET-PUB
CERN AB ---CERN-AB
CERN ALEPH ---CERN-ALEPH
CERN ALEPH PHYSIC ---CERN-ALEPH-PHYSIC
CERN ALEPH PUB ---CERN-ALEPH-PUB
CERN ALICE INT ---CERN-ALICE-INT
CERN ALICE PUB ---CERN-ALICE-PUB
CERN ALI ---CERN-ALI
CERN AS ---CERN-AS
CERN AT ---CERN-AT
CERN ATL COM CAL ---CERN-ATL-COM-CAL
CERN ATL COM DAQ ---CERN-ATL-COM-DAQ
CERN ATL COM GEN ---CERN-ATL-COM-GEN
CERN ATL COM INDET ---CERN-ATL-COM-INDET
CERN ATL COM LARG ---CERN-ATL-COM-LARG
CERN ATL COM MUON ---CERN-ATL-COM-MUON
CERN ATL COM PHYS ---CERN-ATL-COM-PHYS
CERN ATL COM TECH ---CERN-ATL-COM
CERN ATL COM TILECAL ---CERN-ATL-COM
CERN ATL DAQ ---CERN-ATL-DAQ
CERN ATL SOFT ---CERN-ATL-SOFT
CERN ATL SOFT INT ---CERN-ATL-SOFT-INT
CERN ATL SOFT PUB ---CERN-ATL-SOFT-PUB
CERN CMS ---CERN-CMS
CERN CMS CR ---CERN-CMS-CR
CERN CMS NOTE ---CERN-CMS-NOTE
CERN CN ---CERN-CN
CERN DD ---CERN-DD
CERN DELPHI ---CERN-DELPHI
CERN ECP ---CERN-ECP
CERN EF ---CERN-EF
CERN ECP ---CERN-EP
CERN EST ---CERN-EST
CERN ETT ---CERN-ETT
CERN IT ---CERN-IT
CERN LHCB ---CERN-LHCB
CERN LHCC ---CERN-LHCC
CERN LHC ---CERN-LHC
CERN LHC PHO ---CERN-LHC-PHO
CERN LHC PROJECT REPORT---CERN-LHC-Project-Report
CERN OPEN ---CERN-OPEN
CERN PPE ---CERN-PPE
CERN PS ---CERN-PS
CERN SL ---CERN-SL
CERN SPSC ---CERN-SPSC
CERN ST ---CERN-ST
CERN TH ---CERN-TH
CERN THESIS ---CERN-THESIS
CERN TIS ---CERN-TIS
CERN ATS ---CERN-ATS
CERN ---CERN
CMS CR ---CMS-CR
CMS NOTE ---CMS-NOTE
CMS EXO ---CMS-EXO
LHCB ---LHCB
SN ATLAS ---SN-ATLAS
PAS SUSY ---CMS-PAS-SUS
CMS PAS EXO ---CMS-PAS-EXO
CMS PAS HIN ---CMS-PAS-HIN
CMS PAS QCD ---CMS-PAS-QCD
CMS PAS TOP ---CMS-PAS-TOP
CMS PAS SUS ---CMS-PAS-SUS
CMS PAS BPH ---CMS-PAS-BPH
CMS PAS SMP ---CMS-PAS-SMP
CMS PAS HIG ---CMS-PAS-HIG
CMS PAS EWK ---CMS-PAS-EWK
CMS PAS BTV ---CMS-PAS-BTV
CMS PAS FWD ---CMS-PAS-FWD
CMS PAS TRK ---CMS-PAS-TRK
CMS PAS SMP ---CMS-PAS-SMP
CMS PAS PFT ---CMS-PAS-PFT
CMS PAS MUO ---CMS-PAS-MUO
CMS PAS JME ---CMS-PAS-JME
CMS PAS EGM ---CMS-PAS-EGM
CMS PAS DIF ---CMS-PAS-DIF
ATLTILECAL PUB ---ATLTILECAL-PUB
ATLAS TECH PUB ---ATLAS-TECH-PUB
TLCOM MAGNET ---TLCOM-MAGNET
ATLLARG ---ATL-LARG
*****CERN MORE*****
< yyyy 999>
< yyyy 99>
< yyyy 9>
< yy 99>
< yy 9>
CERN LHCB ---CERN-LHCB
CERN LHCC ---CERN-LHCC
CERN PHESS ---CERN-PHESS
*****CERN EVEN MORE*****
< 9>
CMS UG TP ---CMS-UG-TP
*****CERN DIFFERENT FORMAT*****
< 9999999>
CERN GE ---CERN-GE
*****LHC*****
< 999>
< 9999>
CERN CLIC NOTE ---CERN-CLIC-Note
LHC PROJECT NOTE ---LHC-Project-Note
CERN LHC PROJECT REPORT ---CERN-LHC-Project-Report
LHC PROJECT REPORT ---CERN-LHC-Project-Report
CLIC NOTE ---CERN-CLIC-Note
ATLAS TDR ---ATL-TDR
CMS TDR ---CMS-TDR
ATC TT ID ---ATC-TT-ID
ATC TT IN ---ATC-TT-IN
LHCCP ---LHCCP
*****KEK*****
< 9999>
< yy 999>
< yyyy 999>
KEK CP ---KEK-CP
KEK INT ---KEK-Internal
KEK INTERNAL ---KEK-Internal
KEK PREPRINT ---KEK-Preprint
KEK TH ---KEK-TH
*****DESY*****
< yy 999>
< yyyy 999>
DESY ---DESY
-DESY M ---DESY M
+DESY M ---DESY-M
+DESY-THESIS ---DESY-THESIS
+
+
+*****DESY F*****
+<99 9>
+<9 99 99>
+<99 99 99>
+
+DESY F ---DESY-F
*****SLAC*****
< 999>
< 9999>
< yy 99>
SLAC AP ---SLAC-AP
SLAC PUB ---SLAC-PUB
SLAC R ---SLAC-R
SLAC TN ---SLAC-TN
SLAC WP ---SLAC-WP
+
+
+*****Berkeley Lab*****
+< 99999>
+LBNL---LBNL
diff --git a/modules/docextract/lib/Makefile.am b/modules/docextract/lib/Makefile.am
index ab6a60506..5a9faff0a 100644
--- a/modules/docextract/lib/Makefile.am
+++ b/modules/docextract/lib/Makefile.am
@@ -1,51 +1,52 @@
## This file is part of Invenio.
## Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2013 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
pylibdir = $(libdir)/python/invenio
pylib_DATA = docextract_pdf.py \
docextract_text.py \
+ docextract_templates.py \
docextract_utils.py \
docextract_task.py \
docextract_webinterface.py \
docextract_webinterface_unit_tests.py \
docextract_record.py \
docextract_record_regression_tests.py \
docextract_convert_journals.py \
refextract.py \
refextract_task.py \
refextract_config.py \
refextract_engine.py \
refextract_re.py \
refextract_api.py \
refextract_api_unit_tests.py \
refextract_api_regression_tests.py \
refextract_text.py \
refextract_record.py \
refextract_find.py \
refextract_tag.py \
refextract_cli.py \
refextract_kbs.py \
refextract_linker.py \
refextract_regression_tests.py \
refextract_unit_tests.py \
authorextract_re.py
EXTRA_DIST = $(pylib_DATA)
CLEANFILES = *~ *.tmp *.pyc
diff --git a/modules/docextract/lib/authorextract_re.py b/modules/docextract/lib/authorextract_re.py
index d54e9f678..89f1e489a 100644
--- a/modules/docextract/lib/authorextract_re.py
+++ b/modules/docextract/lib/authorextract_re.py
@@ -1,451 +1,461 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
import re
import sys
from invenio.docextract_utils import write_message
from invenio.refextract_config import CFG_REFEXTRACT_KBS
def get_author_affiliation_numeration_str(punct=None):
"""The numeration which can be applied to author names. Numeration
is sometimes found next to authors of papers.
@return: (string), which can be compiled into a regex; identifies
numeration next to an author name.
"""
##FIXME cater for start or end numeration (ie two puncs)
## Number to look for, either general or specific
re_number = '(?:\d\d?)'
re_chained_numbers = "(?:(?:[,;]\s*%s\.?\s*))*" % re_number
## Punctuation surrounding the number, either general or specific again
if punct is None:
re_punct = "(?:[\{\(\[]?)"
else:
re_punct = re.escape(punct)
## Generic number finder (MUST NOT INCLUDE NAMED GROUPS!!!)
numeration_str = """
(?:\s*(%(punct)s)\s* ## Left numeration punctuation
(%(num)s\s* ## Core numeration item, either specific or generic
%(num_chain)s ## Extra numeration, either generic or empty
)
(?:(%(punct)s)) ## Right numeration punctuation
)""" % {'num' : re_number,
'num_chain' : re_chained_numbers,
'punct' : re_punct}
return numeration_str
+letter_re = re.compile(ur'(\w)', re.U)
+letters = set(unichr(n) for n in xrange(1, 0x10000))
+letters -= set(u'%s' % n for n in xrange(0, 10))
+letters -= set(['_'])
+uppercase_letters = set(c.upper() for c in letters if letter_re.match(c))
+UPPERCASE_RE = ur'[%s]' % ''.join(uppercase_letters)
+
def get_initial_surname_author_pattern(incl_numeration=False):
"""Match an author name of the form: 'initial(s) surname'
Return a standard author, with a maximum of 6 initials, and a surname.
The author pattern returned will match 'Initials Surname' formats only.
The Initials MUST be uppercase, and MUST have at least a dot, hypen or apostrophe between them.
@param incl_numeration: (boolean) Return an author pattern with optional numeration after authors.
@return (string): The 'Initials Surname' author pattern."""
# Possible inclusion of superscript numeration at the end of author names
# Will match the empty string
if incl_numeration:
append_num_re = get_author_affiliation_numeration_str() + '?'
else:
append_num_re = ""
return ur"""
(?:
- (?:[A-Z]\w{2,20}\s+)? ## Optionally a first name before the initials
+ (?:%(uppercase_re)s\w{2,20}\s+)? ## Optionally a first name before the initials
(?<!Volume\s) ## Initials (1-5) (cannot follow 'Volume\s')
- [A-Z](?:\s*[.'’\s-]{1,3}\s*[A-Z]){0,4}[.\s-]{1,2}\s* ## separated by .,-,',etc.
+ %(uppercase_re)s(?:\s*[.'’\s-]{1,3}\s*%(uppercase_re)s){0,4}[.\s-]{1,2}\s* ## separated by .,-,',etc.
- (?:[A-Z]\w{2,20}\s+)? ## Optionally a first name after the initials
+ (?:%(uppercase_re)s\w{2,20}\s+)? ## Optionally a first name after the initials
(?:
(?!%(invalid_prefixes)s) ## Invalid prefixes to avoid
- [A-Za-z]{1,3}(?<!and)(?:(?:[’'`´-]\s?)|\s) ## The surname prefix: 1, 2 or 3
+ \w{1,3}(?<!and)(?:(?:[’'`´-]\s?)|\s) ## The surname prefix: 1, 2 or 3
)? ## character prefixes before the surname (e.g. 'van','de')
(?!%(invalid_surnames)s) ## Invalid surnames to avoid
- [A-Z] ## The surname, which must start with an upper case character
+ %(uppercase_re)s ## The surname, which must start with an upper case character
(?:[rR]\.|\w{1,20}) ## handle Jr.
(?:[\-’'`´][\w’']{1,20})? ## single hyphen allowed jan-el or Figueroa-O'Farrill
[’']? ## Eventually an ending '
%(numeration)s ## A possible number to appear after an author name, used for author extraction
(?: # Look for editor notation after the author group...
\s*,?\s* # Eventually a coma/space
%(ed)s
)?
)""" % {
+ 'uppercase_re' : UPPERCASE_RE,
'invalid_prefixes': '|'.join(invalid_prefixes),
'invalid_surnames': '|'.join(invalid_surnames),
'ed' : re_ed_notation,
'numeration' : append_num_re,
}
def get_surname_initial_author_pattern(incl_numeration=False):
"""Match an author name of the form: 'surname initial(s)'
This is sometimes the represention of the first author found inside an author group.
This author pattern is only used to find a maximum of ONE author inside an author group.
Authors of this form MUST have either a comma after the initials, or an 'and',
which denotes the presence of other authors in the author group.
@param incl_numeration: (boolean) Return an author pattern with optional numeration after authors.
@return (string): The 'Surname Initials' author pattern."""
# Possible inclusion of superscript numeration at the end of author names
# Will match the empty string
if incl_numeration:
append_num_re = get_author_affiliation_numeration_str() + '?'
else:
append_num_re = ""
return ur"""
(?:
(?:
(?!%(invalid_prefixes)s) ## Invalid prefixes to avoid
- [A-Za-z]{1,3}(?<!and)(?<!in)(?:(?:[’'`´-]\s?)|\s)
+ \w{1,3}(?<!and)(?<!in)(?:(?:[’'`´-]\s?)|\s)
)? ## The optional surname prefix:
## 1 or 2, 2-3 character prefixes before the surname (e.g. 'van','de')
(?!%(invalid_surnames)s) ## Invalid surnames to avoid
- [A-Z]\w{2,20}(?:[\-’'`´]\w{2,20})? ## The surname, which must start with an upper case character (single hyphen allowed)
+ %(uppercase_re)s\w{2,20}(?:[\-’'`´]\w{2,20})? ## The surname, which must start with an upper case character (single hyphen allowed)
\s*[,.\s]\s* ## The space between the surname and its initials
(?<!Volume\s) ## Initials
- [A-Z](?:\s*[.'’\s-]{1,2}\s*[A-Z]){0,4}\.{0,2} ##
+ %(uppercase_re)s(?:\s*[.'’\s-]{1,2}\s*%(uppercase_re)s){0,4}\.{0,2}
## Either a comma or an 'and' MUST be present ... OR an end of line marker
## (maybe some space's between authors)
## Uses positive lookahead assertion
(?: # Look for editor notation after the author group...
\s*,?\s* # Eventually a coma/space
%(ed)s
)?
)""" % {
+ 'uppercase_re' : UPPERCASE_RE,
'invalid_prefixes': '|'.join(invalid_prefixes),
'invalid_surnames': '|'.join(invalid_surnames),
'ed' : re_ed_notation,
'numeration' : append_num_re,
}
invalid_surnames = (
- 'Supergravity', 'Collaboration', 'Theoretical', 'Appendix', 'Phys', 'Paper'
+ 'Supergravity', 'Collaboration', 'Theoretical', 'Appendix', 'Phys',
+ 'Paper', 'Energy'
)
invalid_prefixes = (
'at',
)
def make_auth_regex_str(etal, initial_surname_author=None, surname_initial_author=None):
"""
Returns a regular expression to be used to identify groups of author names in a citation.
This method contains patterns for default authors, so no arguments are needed for the
most reliable form of matching.
The returned author pattern is capable of:
1. Identifying single authors, with at least one initial, of the form:
'Initial. [surname prefix...] Surname'
2. Identifying multiple authors, each with at least one initial, of the form:
'Initial. [surname prefix...] Surname, [and] [Initial. [surname prefix...] Surname ... ]'
***(Note that a full stop, hyphen or apostrophe after each initial is
absolutely vital in identifying authors for both of these above methods.
Initials must also be uppercase.)***
3. Capture 'et al' statements at the end of author groups (allows for authors with et al
to be processed differently from 'standard' authors)
4. Identifying a single author surname name positioned before the phrase 'et al',
with no initials: 'Surname et al'
5. Identifying two author surname name positioned before the phrase 'et al',
with no initials, but separated by 'and' or '&': 'Surname [and|&] Surname et al'
6. Identifying authors of the form:
'Surname Initials, Initials Surname [Initials Surname]...'. Some authors choose
to represent the most important cited author (in a list of authors) by listing first
their surname, and then their initials. Since this form has little distinguishing
characteristics which could be used to create a reliable a pattern, at least one
standard author must be present after it in order to improve the accuracy.
7. Capture editor notation, of which can take many forms e.g.
'eds. editors. edited by. etc.'. Authors captured in this way can be treated as
'editor groups', and hence processed differently if needed from standard authors
@param etal: (string) The regular expression used to identify 'etal' notation
@param author: (string) An optional argument, which replaces the default author
regex used to identify author groups (initials, surnames... etc)
@return: (string) The full author group identification regex, which will:
- detect groups of authors in a range of formats, e.g.:
C. Hayward, V van Edwards, M. J. Woodbridge, and L. Kelloggs et al.,
- detect whether the author group has been marked up as editors of the doc.
(therefore they will NOT be marked up as authors) e.g.:
ed. C Hayward | (ed) V van Edwards | ed by, M. J. Woodbridge and V van Edwards
| L. Kelloggs (editors) | M. Jackson (eds.) | ...
-detect a maximum of two surnames only if the surname(s) is followed by 'et al'
(must be separated by 'and' if there are two), e.g.:
Amaldi et al., | Hayward and Yellow et al.,
"""
if not initial_surname_author:
## Standard author, with a maximum of 6 initials, and a surname.
## The Initials MUST be uppercase, and MUST have at least a dot, hypen or apostrophe between them.
initial_surname_author = get_initial_surname_author_pattern()
if not surname_initial_author:
## The author name of the form: 'surname initial(s)'
## This is sometimes the represention of the first author found inside an author group.
## This author pattern is only used to find a maximum of ONE author inside an author group.
## Authors of this form MUST have either a comma after the initials, or an 'and',
## which denotes the presence of other authors in the author group.
surname_initial_author = get_surname_initial_author_pattern()
## Pattern used to locate a GROUP of author names in a reference
## The format of an author can take many forms:
## J. Bloggs, W.-H. Smith, D. De Samuel, G.L. Bayetian, C. Hayward et al.,
## (the use of 'et. al' is a giveaway that the preceeding
## text was indeed an author name)
## This will also match authors which seem to be labeled as editors (with the phrase 'ed.')
## In which case, the author will be thrown away later on.
## The regex returned has around 100 named groups already (max), so any new groups must be
## started using '?:'
return ur"""
(?:^|\s+|\() ## Must be the start of the line, or a space (or an opening bracket in very few cases)
(?P<es> ## Look for editor notation before the author
(?:(?:(?:[Ee][Dd]s?|[Ee]dited|[Ee]ditors?)((?:\.\s?)|(?:\.?\s))) ## 'eds?. ' | 'ed ' | 'ed.'
|(?:(?:[Ee][Dd]s?|[Ee]dited|[Ee]ditions?)(?:(?:\.\s?)|(?:\.?\s))by(?:\s|([:,]\s))) ## 'eds?. by, ' | 'ed. by: ' | 'ed by ' | 'ed. by '| 'ed by: '
|(?:\(\s?([Ee][Dd]s?|[Ee]dited|[Ee]ditors?)(?:(?:\.\s?)|(?:\.?\s))?\))) ## '( eds?. )' | '(ed.)' | '(ed )' | '( ed )' | '(ed)'
)?
## **** (1) , one or two surnames which MUST end with 'et al' (e.g. Amaldi et al.,)
(?P<author_names>
(?:
(?:[A-Z](?:\s*[.'’-]{1,2}\s*[A-Z]){0,4}[.\s]\s*)? ## Initials
[A-Z][^0-9_\.\s]{2,20}(?:(?:[,\.]\s*)|(?:[,\.]?\s+)) ## Surname
(?:[A-Z](?:\s*[.'’-]{1,2}\s*[A-Z]){0,4}[.\s]\s*)? ## Initials
(?P<multi_surs>
(?:(?:[Aa][Nn][Dd]|\&)\s+) ## Maybe 'and' or '&' tied with another name
[A-Z][^0-9_\.\s]{3,20}(?:(?:[,\.]\s*)|(?:[,\.]?\s+)) ## More surnames
(?:[A-Z](?:[ -][A-Z])?\s+)? ## with initials
)?
(?: # Look for editor notation after the author group...
\s*[,\s]?\s* # Eventually a coma/space
%(ed)s
)?
(?P<et2>
%(etal)s ## et al, MUST BE PRESENT however, for this author form
)
(?: # Look for editor notation after the author group...
\s*[,\s]?\s* # Eventually a coma/space
%(ed)s
)?
) |
(?:
## **** (2) , The standard author form.. (e.g. J. Bloggs)
## This author form can either start with a normal 'initial surname' author,
## or it can begin with a single 'surname initial' author
(?: ## The first author in the 'author group'
%(i_s_author)s |
(?P<sur_initial_auth>%(s_i_author)s)
)
(?P<multi_auth>
(?: ## Then 0 or more author names
\s*[,\s]\s*
(?:
%(i_s_author)s | %(s_i_author)s
)
)*
(?: ## Maybe 'and' or '&' tied with another name
(?:
\s*[,\s]\s* ## handle "J. Dan, and H. Pon"
(?:[Aa][Nn][DdsS]|\&)
\s+
)
(?P<mult_auth_sub>
%(i_s_author)s | %(s_i_author)s
)
)?
)
(?P<et> # 'et al' need not be present for either of
\s*[,\s]\s*
%(etal)s # 'initial surname' or 'surname initial' authors
)?
)
)
(?P<ee>
\s*[,\s]\s*
\(?
(?:[Ee][Dd]s|[Ee]ditors)\.?
\)?
[\.\,]{0,2}
)?
# End of all author name patterns
\)? # A possible closing bracket to finish the author group
(?=[\s,.;:]) # Consolidate by checking we are not partially matching
# something else
""" % { 'etal' : etal,
'i_s_author' : initial_surname_author,
's_i_author' : surname_initial_author,
'ed' : re_ed_notation }
## Finding an et. al, before author names indicates a bad match!!!
## I.e. could be a title match... ignore it
etal_matches = (
u' et al.,',
u' et. al.,',
u' et. al.',
u' et.al.,',
u' et al.',
u' et al',
)
# Editor notation: 'eds?.' | 'ed.' | 'ed'
re_ed_text = ur"(?:[Ee][Dd]|[Ee]dited|[Ee]ditor)\.?"
re_ed_notation = ur"""
(?:
\(?
%(text)s
\s?
\)?
[\.\,]{0,2}
)""" % {'text': re_ed_text}
## Standard et al ('and others') pattern for author recognition
re_etal = ur"""[Ee][Tt](?:(?:(?:,|\.)\s*)|(?:(?:,|\.)?\s+))[Aa][Ll][,\.]?[,\.]?"""
## The pattern used to identify authors inside references
re_auth = (re.compile(make_auth_regex_str(re_etal), re.VERBOSE|re.UNICODE))
## Given an Auth hit, some misc text, and then another Auth hit straight after,
## (OR a bad_and was found)
## check the entire misc text to see if is 'looks' like an author group, which didn't match
## as a normal author. In which case, append it to the single author group.
## PLEASE use this pattern only against space stripped text.
## IF a bad_and was found (from above).. do re.search using this pattern
## ELIF an auth-misc-auth combo was hit, do re.match using this pattern
re_weaker_author = ur"""
## look closely for initials, and less closely at the last name.
(?:([A-Z]((\.\s?)|(\.?\s+)|(\-))){1,5}
(?:[^\s_<>0-9]+(?:(?:[,\.]\s*)|(?:[,\.]?\s+)))+)"""
## End of line MUST match, since the next string is definitely a portion of an author group (append '$')
re_auth_near_miss = re.compile(make_auth_regex_str(
re_etal, "(" + re_weaker_author + ")+$"), re.VERBOSE|re.UNICODE)
## Used as a weak mechanism to classify possible authors above identified affiliations
## (start) Firstname SurnamePrefix Surname (end)
re_ambig_auth = re.compile(u"^\s*[A-Z][^\s_<>0-9]+\s+([^\s_<>0-9]{1,3}\.?\s+)?[A-Z][^\s_<>0-9]+\s*$", \
re.UNICODE)
## Obtain the compiled expression which includes the proper author numeration
## (The pattern used to identify authors of papers)
## This pattern will match groups of authors, from the start of the line
re_auth_with_number = re.compile(make_auth_regex_str(
re_etal,
get_initial_surname_author_pattern(incl_numeration=True),
get_surname_initial_author_pattern(incl_numeration=True)
), re.VERBOSE | re.UNICODE)
## Used to obtain authors chained by connectives across multiple lines
re_comma_or_and_at_start = re.compile("^(,|((,\s*)?[Aa][Nn][Dd]|&))\s", re.UNICODE)
def make_collaborations_regex_str():
""" From the authors knowledge-base, construct a single regex holding the or'd possibilities of patterns
which should be included in $h subfields. The word 'Collaboration' is also converted to 'Coll', and
used in finding matches. Letter case is not considered during the search.
@return: (string) The single pattern built from each line in the author knowledge base.
"""
def add_to_auth_list(s):
""" Strip the line, replace spaces with '\s' and append 'the' to the start
and 's' to the end. Add the prepared line to the list of extra kb authors."""
s = u"(?:the\s)?" + s.strip().replace(u' ', u'\s') + u"s?"
auths.append(s)
## Build the 'or'd regular expression of the author lines in the author knowledge base
auths = []
fpath = CFG_REFEXTRACT_KBS['collaborations']
try:
fh = open(fpath, "r")
except IOError:
## problem opening KB for reading, or problem while reading from it:
emsg = """Error: Could not build knowledge base containing """ \
"""author patterns - failed """ \
"""to read from KB %(kb)s.\n""" \
% {'kb' : fpath}
write_message(emsg, sys.stderr, verbose=0)
raise IOError("Error: Unable to open collaborations kb '%s'" % fpath)
for line_num, rawline in enumerate(fh):
try:
rawline = rawline.decode("utf-8")
except UnicodeError:
write_message("*** Unicode problems in %s for line %d" \
% (fpath, line_num), sys.stderr, verbose=0)
raise UnicodeError("Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
if rawline.strip() and rawline[0].strip() != '#':
add_to_auth_list(rawline)
## Shorten collaboration to 'coll'
if rawline.lower().endswith('collaboration\n'):
coll_version = rawline[:rawline.lower().find(u'collaboration\n')] + u"coll[\.\,]"
add_to_auth_list(coll_version.strip().replace(' ', '\s') + u"s?")
author_match_re = ""
if len(auths) > 0:
author_match_re = u'|'.join([u"(?:" + a + u")" for a in auths])
author_match_re = ur"(?:(?:[\(\"]?(?P<extra_auth>" + \
author_match_re + ur")[\)\"]?[\,\.]?\s?(?:and\s)?)+)"
return author_match_re
## Create the regular expression used to find user-specified 'extra' authors
## (letter case is not concidered when matching)
re_collaborations = re.compile(make_collaborations_regex_str(), re.I|re.U)
def get_single_author_pattern():
"""Generates a simple, one-hit-only, author name pattern, matching just one author
name in either of the 'S I' or 'I S' formats. The author patterns are the same
ones used inside the main 'author group' pattern generator. This function is used
not for reference extraction, but for author extraction. Numeration is appended
to author patterns by default.
@return (string): Just the author name pattern designed to identify single author names
in both SI and IS formats. (NO 'et al', editors, 'and'... matching)
@return: (string) the union of 'initial surname' and 'surname initial'
authors"""
return "(?:"+ get_initial_surname_author_pattern(incl_numeration=True) + \
"|" + get_surname_initial_author_pattern(incl_numeration=True) + ")"
## Targets single author names
re_single_author_pattern = re.compile(get_single_author_pattern(), re.VERBOSE)
diff --git a/modules/docextract/lib/docextract_pdf.py b/modules/docextract/lib/docextract_pdf.py
index 5fa633c02..03d7b91ac 100644
--- a/modules/docextract/lib/docextract_pdf.py
+++ b/modules/docextract/lib/docextract_pdf.py
@@ -1,501 +1,519 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
When a document is converted to plain-text from PDF,
certain characters may result in the plain-text, that are
either unwanted, or broken. These characters need to be corrected
or removed. Examples are, certain control characters that would
be illegal in XML and must be removed; TeX ligatures (etc); broken
accents such as umlauts on letters that must be corrected.
This module provides a dictionary of (unwanted) characters to look
for and the characters that should be used to replace them.
@return: (dictionary) - { seek -> replace, } of characters to
replace in plain-text.
"""
import re
import subprocess
from invenio.config import CFG_PATH_PDFTOTEXT
from invenio.docextract_utils import write_message
# a dictionary of undesirable characters and their replacements:
UNDESIRABLE_CHAR_REPLACEMENTS = {
# Control characters not allowed in XML:
u'\u2028' : u"",
u'\u2029' : u"",
u'\u202A' : u"",
u'\u202B' : u"",
u'\u202C' : u"",
u'\u202D' : u"",
u'\u202E' : u"",
u'\u206A' : u"",
u'\u206B' : u"",
u'\u206C' : u"",
u'\u206D' : u"",
u'\u206E' : u"",
u'\u206F' : u"",
u'\uFFF9' : u"",
u'\uFFFA' : u"",
u'\uFFFB' : u"",
u'\uFFFC' : u"",
u'\uFEFF' : u"",
- # Remove the result of an bad UTF-8 character
+ # Remove the result of a bad UTF-8 character
u'\uFFFF' : u"",
# Language Tag Code Points:
u"\U000E0000" : u"",
u"\U000E0001" : u"",
u"\U000E0002" : u"",
u"\U000E0003" : u"",
u"\U000E0004" : u"",
u"\U000E0005" : u"",
u"\U000E0006" : u"",
u"\U000E0007" : u"",
u"\U000E0008" : u"",
u"\U000E0009" : u"",
u"\U000E000A" : u"",
u"\U000E000B" : u"",
u"\U000E000C" : u"",
u"\U000E000D" : u"",
u"\U000E000E" : u"",
u"\U000E000F" : u"",
u"\U000E0010" : u"",
u"\U000E0011" : u"",
u"\U000E0012" : u"",
u"\U000E0013" : u"",
u"\U000E0014" : u"",
u"\U000E0015" : u"",
u"\U000E0016" : u"",
u"\U000E0017" : u"",
u"\U000E0018" : u"",
u"\U000E0019" : u"",
u"\U000E001A" : u"",
u"\U000E001B" : u"",
u"\U000E001C" : u"",
u"\U000E001D" : u"",
u"\U000E001E" : u"",
u"\U000E001F" : u"",
u"\U000E0020" : u"",
u"\U000E0021" : u"",
u"\U000E0022" : u"",
u"\U000E0023" : u"",
u"\U000E0024" : u"",
u"\U000E0025" : u"",
u"\U000E0026" : u"",
u"\U000E0027" : u"",
u"\U000E0028" : u"",
u"\U000E0029" : u"",
u"\U000E002A" : u"",
u"\U000E002B" : u"",
u"\U000E002C" : u"",
u"\U000E002D" : u"",
u"\U000E002E" : u"",
u"\U000E002F" : u"",
u"\U000E0030" : u"",
u"\U000E0031" : u"",
u"\U000E0032" : u"",
u"\U000E0033" : u"",
u"\U000E0034" : u"",
u"\U000E0035" : u"",
u"\U000E0036" : u"",
u"\U000E0037" : u"",
u"\U000E0038" : u"",
u"\U000E0039" : u"",
u"\U000E003A" : u"",
u"\U000E003B" : u"",
u"\U000E003C" : u"",
u"\U000E003D" : u"",
u"\U000E003E" : u"",
u"\U000E003F" : u"",
u"\U000E0040" : u"",
u"\U000E0041" : u"",
u"\U000E0042" : u"",
u"\U000E0043" : u"",
u"\U000E0044" : u"",
u"\U000E0045" : u"",
u"\U000E0046" : u"",
u"\U000E0047" : u"",
u"\U000E0048" : u"",
u"\U000E0049" : u"",
u"\U000E004A" : u"",
u"\U000E004B" : u"",
u"\U000E004C" : u"",
u"\U000E004D" : u"",
u"\U000E004E" : u"",
u"\U000E004F" : u"",
u"\U000E0050" : u"",
u"\U000E0051" : u"",
u"\U000E0052" : u"",
u"\U000E0053" : u"",
u"\U000E0054" : u"",
u"\U000E0055" : u"",
u"\U000E0056" : u"",
u"\U000E0057" : u"",
u"\U000E0058" : u"",
u"\U000E0059" : u"",
u"\U000E005A" : u"",
u"\U000E005B" : u"",
u"\U000E005C" : u"",
u"\U000E005D" : u"",
u"\U000E005E" : u"",
u"\U000E005F" : u"",
u"\U000E0060" : u"",
u"\U000E0061" : u"",
u"\U000E0062" : u"",
u"\U000E0063" : u"",
u"\U000E0064" : u"",
u"\U000E0065" : u"",
u"\U000E0066" : u"",
u"\U000E0067" : u"",
u"\U000E0068" : u"",
u"\U000E0069" : u"",
u"\U000E006A" : u"",
u"\U000E006B" : u"",
u"\U000E006C" : u"",
u"\U000E006D" : u"",
u"\U000E006E" : u"",
u"\U000E006F" : u"",
u"\U000E0070" : u"",
u"\U000E0071" : u"",
u"\U000E0072" : u"",
u"\U000E0073" : u"",
u"\U000E0074" : u"",
u"\U000E0075" : u"",
u"\U000E0076" : u"",
u"\U000E0077" : u"",
u"\U000E0078" : u"",
u"\U000E0079" : u"",
u"\U000E007A" : u"",
u"\U000E007B" : u"",
u"\U000E007C" : u"",
u"\U000E007D" : u"",
u"\U000E007E" : u"",
u"\U000E007F" : u"",
# Musical Notation Scoping
u"\U0001D173" : u"",
u"\U0001D174" : u"",
u"\U0001D175" : u"",
u"\U0001D176" : u"",
u"\U0001D177" : u"",
u"\U0001D178" : u"",
u"\U0001D179" : u"",
u"\U0001D17A" : u"",
u'\u0000' : u"", # NULL
u'\u0001' : u"", # START OF HEADING
# START OF TEXT & END OF TEXT:
u'\u0002' : u"",
u'\u0003' : u"",
u'\u0004' : u"", # END OF TRANSMISSION
# ENQ and ACK
u'\u0005' : u"",
u'\u0006' : u"",
u'\u0007' : u"", # BELL
u'\u0008' : u"", # BACKSPACE
# SHIFT-IN & SHIFT-OUT
u'\u000E' : u"",
u'\u000F' : u"",
# Other controls:
u'\u0010' : u"", # DATA LINK ESCAPE
u'\u0011' : u"", # DEVICE CONTROL ONE
u'\u0012' : u"", # DEVICE CONTROL TWO
u'\u0013' : u"", # DEVICE CONTROL THREE
u'\u0014' : u"", # DEVICE CONTROL FOUR
u'\u0015' : u"", # NEGATIVE ACK
u'\u0016' : u"", # SYNCRONOUS IDLE
u'\u0017' : u"", # END OF TRANSMISSION BLOCK
u'\u0018' : u"", # CANCEL
u'\u0019' : u"", # END OF MEDIUM
u'\u001A' : u"", # SUBSTITUTE
u'\u001B' : u"", # ESCAPE
u'\u001C' : u"", # INFORMATION SEPARATOR FOUR (file separator)
u'\u001D' : u"", # INFORMATION SEPARATOR THREE (group separator)
u'\u001E' : u"", # INFORMATION SEPARATOR TWO (record separator)
u'\u001F' : u"", # INFORMATION SEPARATOR ONE (unit separator)
# \r -> remove it
u'\r' : u"",
# Strange parantheses - change for normal:
u'\x1c' : u'(',
u'\x1d' : u')',
# Some ff from tex:
u'\u0013\u0010' : u'\u00ED',
u'\x0b' : u'ff',
# fi from tex:
u'\x0c' : u'fi',
# ligatures from TeX:
u'\ufb00' : u'ff',
u'\ufb01' : u'fi',
u'\ufb02' : u'fl',
u'\ufb03' : u'ffi',
u'\ufb04' : u'ffl',
# Superscripts from TeX
u'\u2212' : u'-',
u'\u2013' : u'-',
# Word style speech marks:
u'\u201c ': u'"',
u'\u201d' : u'"',
u'\u201c' : u'"',
# pdftotext has problems with umlaut and prints it as diaeresis
# followed by a letter:correct it
# (Optional space between char and letter - fixes broken
# line examples)
u'\u00A8 a' : u'\u00E4',
u'\u00A8 e' : u'\u00EB',
u'\u00A8 i' : u'\u00EF',
u'\u00A8 o' : u'\u00F6',
u'\u00A8 u' : u'\u00FC',
u'\u00A8 y' : u'\u00FF',
u'\u00A8 A' : u'\u00C4',
u'\u00A8 E' : u'\u00CB',
u'\u00A8 I' : u'\u00CF',
u'\u00A8 O' : u'\u00D6',
u'\u00A8 U' : u'\u00DC',
u'\u00A8 Y' : u'\u0178',
u'\xA8a' : u'\u00E4',
u'\xA8e' : u'\u00EB',
u'\xA8i' : u'\u00EF',
u'\xA8o' : u'\u00F6',
u'\xA8u' : u'\u00FC',
u'\xA8y' : u'\u00FF',
u'\xA8A' : u'\u00C4',
u'\xA8E' : u'\u00CB',
u'\xA8I' : u'\u00CF',
u'\xA8O' : u'\u00D6',
u'\xA8U' : u'\u00DC',
u'\xA8Y' : u'\u0178',
# More umlaut mess to correct:
u'\x7fa' : u'\u00E4',
u'\x7fe' : u'\u00EB',
u'\x7fi' : u'\u00EF',
u'\x7fo' : u'\u00F6',
u'\x7fu' : u'\u00FC',
u'\x7fy' : u'\u00FF',
u'\x7fA' : u'\u00C4',
u'\x7fE' : u'\u00CB',
u'\x7fI' : u'\u00CF',
u'\x7fO' : u'\u00D6',
u'\x7fU' : u'\u00DC',
u'\x7fY' : u'\u0178',
u'\x7f a' : u'\u00E4',
u'\x7f e' : u'\u00EB',
u'\x7f i' : u'\u00EF',
u'\x7f o' : u'\u00F6',
u'\x7f u' : u'\u00FC',
u'\x7f y' : u'\u00FF',
u'\x7f A' : u'\u00C4',
u'\x7f E' : u'\u00CB',
u'\x7f I' : u'\u00CF',
u'\x7f O' : u'\u00D6',
u'\x7f U' : u'\u00DC',
u'\x7f Y' : u'\u0178',
# pdftotext: fix accute accent:
u'\x13a' : u'\u00E1',
u'\x13e' : u'\u00E9',
u'\x13i' : u'\u00ED',
u'\x13o' : u'\u00F3',
u'\x13u' : u'\u00FA',
u'\x13y' : u'\u00FD',
u'\x13A' : u'\u00C1',
u'\x13E' : u'\u00C9',
u'\x13I' : u'\u00CD',
u'\x13ı' : u'\u00ED', # Lower case turkish 'i' (dotless i)
u'\x13O' : u'\u00D3',
u'\x13U' : u'\u00DA',
u'\x13Y' : u'\u00DD',
u'\x13 a' : u'\u00E1',
u'\x13 e' : u'\u00E9',
u'\x13 i' : u'\u00ED',
u'\x13 o' : u'\u00F3',
u'\x13 u' : u'\u00FA',
u'\x13 y' : u'\u00FD',
u'\x13 A' : u'\u00C1',
u'\x13 E' : u'\u00C9',
u'\x13 I' : u'\u00CD',
u'\x13 ı' : u'\u00ED',
u'\x13 O' : u'\u00D3',
u'\x13 U' : u'\u00DA',
u'\x13 Y' : u'\u00DD',
u'\u00B4 a' : u'\u00E1',
u'\u00B4 e' : u'\u00E9',
u'\u00B4 i' : u'\u00ED',
u'\u00B4 o' : u'\u00F3',
u'\u00B4 u' : u'\u00FA',
u'\u00B4 y' : u'\u00FD',
u'\u00B4 A' : u'\u00C1',
u'\u00B4 E' : u'\u00C9',
u'\u00B4 I' : u'\u00CD',
u'\u00B4 ı' : u'\u00ED',
u'\u00B4 O' : u'\u00D3',
u'\u00B4 U' : u'\u00DA',
u'\u00B4 Y' : u'\u00DD',
u'\u00B4a' : u'\u00E1',
u'\u00B4e' : u'\u00E9',
u'\u00B4i' : u'\u00ED',
u'\u00B4o' : u'\u00F3',
u'\u00B4u' : u'\u00FA',
u'\u00B4y' : u'\u00FD',
u'\u00B4A' : u'\u00C1',
u'\u00B4E' : u'\u00C9',
u'\u00B4I' : u'\u00CD',
u'\u00B4ı' : u'\u00ED',
u'\u00B4O' : u'\u00D3',
u'\u00B4U' : u'\u00DA',
u'\u00B4Y' : u'\u00DD',
# pdftotext: fix grave accent:
u'\u0060 a' : u'\u00E0',
u'\u0060 e' : u'\u00E8',
u'\u0060 i' : u'\u00EC',
u'\u0060 o' : u'\u00F2',
u'\u0060 u' : u'\u00F9',
u'\u0060 A' : u'\u00C0',
u'\u0060 E' : u'\u00C8',
u'\u0060 I' : u'\u00CC',
u'\u0060 O' : u'\u00D2',
u'\u0060 U' : u'\u00D9',
u'\u0060a' : u'\u00E0',
u'\u0060e' : u'\u00E8',
u'\u0060i' : u'\u00EC',
u'\u0060o' : u'\u00F2',
u'\u0060u' : u'\u00F9',
u'\u0060A' : u'\u00C0',
u'\u0060E' : u'\u00C8',
u'\u0060I' : u'\u00CC',
u'\u0060O' : u'\u00D2',
u'\u0060U' : u'\u00D9',
+ u'a´': u'á',
+ u'i´': u'í',
+ u'e´': u'é',
+ u'u´': u'ú',
+ u'o´': u'ó',
# \02C7 : caron
u'\u02C7C' : u'\u010C',
u'\u02C7c' : u'\u010D',
u'\u02C7S' : u'\u0160',
u'\u02C7s' : u'\u0161',
u'\u02C7Z' : u'\u017D',
u'\u02C7z' : u'\u017E',
# \027 : aa (a with ring above)
u'\u02DAa' : u'\u00E5',
u'\u02DAA' : u'\u00C5',
# \030 : cedilla
u'\u0327c' : u'\u00E7',
u'\u0327C' : u'\u00C7',
+ u'¸c': u'ç',
# \02DC : tilde
u'\u02DCn' : u'\u00F1',
u'\u02DCN' : u'\u00D1',
u'\u02DCo' : u'\u00F5',
u'\u02DCO' : u'\u00D5',
u'\u02DCa' : u'\u00E3',
u'\u02DCA' : u'\u00C3',
u'\u02DCs' : u'\u0303s', # Combining tilde with 's'
- }
+ # Circumflex accent (caret accent)
+ u'aˆ': u'â',
+ u'iˆ': u'î',
+ u'eˆ': u'ê',
+ u'uˆ': u'û',
+ u'oˆ': u'ô',
+ u'ˆa': u'â',
+ u'ˆi': u'î',
+ u'ˆe': u'ê',
+ u'ˆu': u'û',
+ u'ˆo': u'ô',
+}
UNDESIRABLE_STRING_REPLACEMENTS = [
(u'\u201c ', '"'),
]
def replace_undesirable_characters(line):
    """
    Replace certain bad characters in a text line.
    @param line: (string) the text line in which bad characters are to
        be replaced.
    @return: (string) the text line after the bad characters have been
        replaced.
    """
    # The multi-char string replacements run first, on purpose: they are
    # kept in an ordered list (not the dict) so their application order
    # is deterministic and happens before single-char cleanup.
    for bad_string, replacement in UNDESIRABLE_STRING_REPLACEMENTS:
        line = line.replace(bad_string, replacement)

    # Then apply the (unordered) dict of char/sequence replacements
    for bad_char, replacement in UNDESIRABLE_CHAR_REPLACEMENTS.iteritems():
        line = line.replace(bad_char, replacement)

    return line
def pdftotext_conversion_is_bad(txtlines):
    """Sometimes pdftotext performs a bad conversion which consists of many
    spaces and garbage characters.
    This method takes a list of strings obtained from a pdftotext conversion
    and examines them to see if they are likely to be the result of a bad
    conversion.
    @param txtlines: (list) of unicode strings obtained from pdftotext
        conversion.
    @return: (boolean) True if the conversion looks bad (at least three
        whitespace characters per 'word' in the document); False otherwise.
    """
    # Numbers of 'words' and 'whitespaces' found in document:
    numWords = numSpaces = 0
    # whitespace character pattern:
    p_space = re.compile(unicode(r'(\s)'), re.UNICODE)
    # non-whitespace 'word' pattern:
    p_noSpace = re.compile(unicode(r'(\S+)'), re.UNICODE)
    for txtline in txtlines:
        numWords += len(p_noSpace.findall(txtline.strip()))
        numSpaces += len(p_space.findall(txtline.strip()))
    # Too many spaces relative to words - probably a bad conversion
    return numSpaces >= numWords * 3
def convert_PDF_to_plaintext(fpath, keep_layout=False):
    """ Convert PDF to txt using pdftotext

    Take the path to a PDF file and run pdftotext for this file, capturing
    the output.
    @param fpath: (string) path to the PDF file
    @param keep_layout: (boolean) if True, pass "-layout" to pdftotext so
        the physical layout of the text is preserved; otherwise "-raw"
        keeps the text in content-stream order.
    @return: (tuple) (doclines, status): doclines is a list of unicode
        strings (contents of the PDF file translated into plaintext; each
        string is a line in the document); status is 0 on success or 2
        when the conversion looks bad (see pdftotext_conversion_is_bad).
    """
    if keep_layout:
        layout_option = "-layout"
    else:
        layout_option = "-raw"
    status = 0
    doclines = []
    # Pattern to check for lines with a leading page-break character.
    # If this pattern is matched, we want to split the page-break into
    # its own line because we rely upon this for trying to strip headers
    # and footers, and for some other pattern matching.
    p_break_in_line = re.compile(ur'^\s*\f(.+)$', re.UNICODE)
    # build pdftotext command:
    cmd_pdftotext = [CFG_PATH_PDFTOTEXT, layout_option, "-q",
                     "-enc", "UTF-8", fpath, "-"]
    write_message("* %s" % ' '.join(cmd_pdftotext), verbose=2)
    # open pipe to pdftotext ("-" makes it write to stdout):
    pipe_pdftotext = subprocess.Popen(cmd_pdftotext, stdout=subprocess.PIPE)
    # read back results, line by line:
    for docline in pipe_pdftotext.stdout:
        unicodeline = docline.decode("utf-8")
        # Check for a page-break in this line:
        m_break_in_line = p_break_in_line.match(unicodeline)
        if m_break_in_line is None:
            # There was no page-break in this line. Just add the line:
            doclines.append(unicodeline)
        else:
            # If there was a page-break character in the same line as some
            # text, split it out into its own line so that we can later
            # try to find headers and footers:
            doclines.append(u"\f")
            doclines.append(m_break_in_line.group(1))
    write_message("* convert_PDF_to_plaintext found: " \
        "%s lines of text" % len(doclines), verbose=2)
    # finally, check conversion result not bad:
    if pdftotext_conversion_is_bad(doclines):
        status = 2
        doclines = []
    return (doclines, status)
diff --git a/modules/docextract/lib/docextract_task.py b/modules/docextract/lib/docextract_task.py
index 6e7dd1425..5c9130cb7 100644
--- a/modules/docextract/lib/docextract_task.py
+++ b/modules/docextract/lib/docextract_task.py
@@ -1,205 +1,210 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2011, 2012 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Generic Framework for extracting metadata from records using bibsched"""
import traceback
from datetime import datetime
from itertools import chain
from invenio.bibtask import task_get_option, write_message, \
task_sleep_now_if_required, \
task_update_progress
from invenio.dbquery import run_sql
from invenio.search_engine import get_record
from invenio.search_engine import get_collection_reclist
from invenio.refextract_api import get_pdf_doc
from invenio.bibrecord import record_get_field_instances, \
field_get_subfield_values
-def task_run_core_wrapper(name, core_func, extra_vars=None):
+def task_run_core_wrapper(name, core_func, extra_vars=None, post_process=None):
def fun():
try:
- return task_run_core(name, core_func, extra_vars)
+ return task_run_core(name, core_func,
+ extra_vars=extra_vars,
+ post_process=post_process)
except Exception:
# Remove extra '\n'
write_message(traceback.format_exc()[:-1])
raise
return fun
def fetch_last_updated(name):
    """Return the (last_recid, last_updated) bookkeeping pair for a task.

    Looks up the xtrJOB row for the given task name.  On the very first
    run no row exists yet: one is seeded with neutral defaults and read
    back.  NULL columns fall back to recid 0 / year-1 datetime.
    """
    select_sql = "SELECT last_recid, last_updated FROM xtrJOB" \
                 " WHERE name = %s LIMIT 1"
    row = run_sql(select_sql, (name,))
    if not row:
        # First run for this task name: seed the bookkeeping row
        insert_sql = "INSERT INTO xtrJOB (name, last_updated, last_recid) " \
                     "VALUES (%s, '1970-01-01', 0)"
        run_sql(insert_sql, (name,))
        row = run_sql(select_sql, (name,))

    stored_recid, stored_date = row[0]
    # Fallback in case we receive None instead of a valid date
    return stored_recid or 0, stored_date or datetime(year=1, month=1, day=1)
def store_last_updated(recid, creation_date, name):
    """Advance the xtrJOB bookkeeping row for the given task name.

    Both stored values only ever move forward: the WHERE clauses ensure
    neither last_recid nor last_updated can regress.
    """
    run_sql("UPDATE xtrJOB SET last_recid = %s WHERE name=%s AND last_recid < %s",
            (recid, name, recid))

    iso_date = creation_date.isoformat()
    run_sql("UPDATE xtrJOB SET last_updated = %s "
            "WHERE name=%s AND last_updated < %s",
            (iso_date, name, iso_date))
def fetch_concerned_records(name):
    """Return the (recid, date) rows this task should process.

    Depending on the task options:
    * 'new': records created since the last recorded run
    * 'modified': records modified since the last recorded run
    * otherwise: explicitly requested record ids / collections; their
      date column is NULL, so store_last_updated is skipped for them
    """
    task_update_progress("Fetching record ids")

    last_recid, last_date = fetch_last_updated(name)

    if task_get_option('new'):
        # Fetch all records inserted since last run
        sql = "SELECT `id`, `creation_date` FROM `bibrec` " \
            "WHERE `creation_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `creation_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    elif task_get_option('modified'):
        # Fetch all records modified since last run
        sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
            "WHERE `modification_date` >= %s " \
            "AND `id` > %s " \
            "ORDER BY `modification_date`"
        records = run_sql(sql, (last_date.isoformat(), last_recid))
    else:
        given_recids = task_get_option('recids')
        for collection in task_get_option('collections'):
            # NOTE(review): get_collection_reclist presumably returns a
            # set of recids; set.add() inserts that whole set as a single
            # element -- should this be update()? TODO confirm.
            given_recids.add(get_collection_reclist(collection))

        if given_recids:
            # Build one placeholder per requested id for the IN clause
            format_strings = ','.join(['%s'] * len(given_recids))
            records = run_sql("SELECT `id`, NULL FROM `bibrec` " \
                "WHERE `id` IN (%s) ORDER BY `id`" % format_strings,
                list(given_recids))
        else:
            records = []

    task_update_progress("Done fetching record ids")

    return records
def fetch_concerned_arxiv_records(name):
    """Return recently-touched arXiv records whose pdf is newer than the
    last run.

    Selects records modified since the last run, restricted to those
    created within the last 7 days (capped at 5000 rows), then filters
    to records carrying an 'arXiv' report number in tag 037 subfield a
    and whose attached pdf document is newer than the last run date.
    """
    task_update_progress("Fetching arxiv record ids")

    dummy, last_date = fetch_last_updated(name)

    # Fetch all records modified since last run (recent creations only)
    # NOTE(review): the concatenation leaves no space between
    # `modification_date` and LIMIT -- verify the SQL server tolerates it
    sql = "SELECT `id`, `modification_date` FROM `bibrec` " \
        "WHERE `modification_date` >= %s " \
        "AND `creation_date` > NOW() - INTERVAL 7 DAY " \
        "ORDER BY `modification_date`" \
        "LIMIT 5000"
    records = run_sql(sql, [last_date.isoformat()])

    def check_arxiv(recid):
        # True when the record has a 037__a value starting with 'arXiv'
        record = get_record(recid)

        for report_tag in record_get_field_instances(record, "037"):
            for category in field_get_subfield_values(report_tag, 'a'):
                if category.startswith('arXiv'):
                    return True
        return False

    def check_pdf_date(recid):
        # True when the attached pdf is newer than the last run
        # NOTE(review): doc.md looks like the document's modification
        # date -- confirm against get_pdf_doc
        doc = get_pdf_doc(recid)
        if doc:
            return doc.md > last_date
        return False

    # Keep only arXiv records, then only those with a fresh pdf
    records = [(r, mod_date) for r, mod_date in records if check_arxiv(r)]
    records = [(r, mod_date) for r, mod_date in records if check_pdf_date(r)]

    write_message("recids %s" % repr([(r, mod_date.isoformat()) \
        for r, mod_date in records]))

    task_update_progress("Done fetching arxiv record ids")

    return records
def process_records(name, records, func, extra_vars):
    """Apply `func` to every (recid, date) pair, reporting progress.

    After each record that carries a date, the xtrJOB bookkeeping is
    advanced so that an interrupted task can resume where it stopped.
    """
    total = len(records)
    for count, (recid, date) in enumerate(records, 1):
        # Give bibsched a chance to pause/stop us between records
        task_sleep_now_if_required(can_stop_too=True)
        msg = "Extracting for %s (%d/%d)" % (recid, count, total)
        task_update_progress(msg)
        write_message(msg)
        func(recid, **extra_vars)
        if date:
            store_last_updated(recid, date, name)
-def task_run_core(name, func, extra_vars=None):
+def task_run_core(name, func, extra_vars=None, post_process=None):
"""Calls extract_references in refextract"""
if task_get_option('task_specific_name'):
name = "%s:%s" % (name, task_get_option('task_specific_name'))
write_message("Starting %s" % name)
if extra_vars is None:
extra_vars = {}
records = fetch_concerned_records(name)
process_records(name, records, func, extra_vars)
if task_get_option('arxiv'):
extra_vars['_arxiv'] = True
arxiv_name = "%s:arxiv" % name
records = fetch_concerned_arxiv_records(arxiv_name)
process_records(arxiv_name, records, func, extra_vars)
+ if post_process:
+ post_process(**extra_vars)
+
write_message("Complete")
return True
def split_ids(value):
    """
    Split ids given in the command line

    Possible formats are:
    * 1
    * 1,2,3,4
    * 1-5,20,30,40
    Returns an iterator over the corresponding integer ids, respectively
    * 1
    * 1, 2, 3, 4
    * 1, 2, 3, 4, 5, 20, 30, 40
    Empty segments (e.g. from trailing commas) are ignored.
    """
    def parse(el):
        # Parse one comma-separated element into an iterable of ints;
        # 'a-b' ranges are inclusive at both ends.
        el = el.strip()
        if not el:
            ret = []
        elif '-' in el:
            start, end = el.split('-', 1)
            ret = xrange(int(start), int(end) + 1)
        else:
            ret = [int(el)]
        return ret
    return chain(*(parse(c) for c in value.split(',') if c.strip()))
diff --git a/modules/docextract/lib/docextract_templates.py b/modules/docextract/lib/docextract_templates.py
new file mode 100644
index 000000000..2bdff506d
--- /dev/null
+++ b/modules/docextract/lib/docextract_templates.py
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+##
+## This file is part of Invenio.
+## Copyright (C) 2013 CERN.
+##
+## Invenio is free software; you can redistribute it and/or
+## modify it under the terms of the GNU General Public License as
+## published by the Free Software Foundation; either version 2 of the
+## License, or (at your option) any later version.
+##
+## Invenio is distributed in the hope that it will be useful, but
+## WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+## General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with Invenio; if not, write to the Free Software Foundation, Inc.,
+## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+
+
+"""DocExtract templates for the web API"""
+
+
+class Template(object):
+
+ def tmpl_web_form(self):
+ """Template for extraction page"""
+ return """
+ <style type="text/css">
+ #extract_form input.urlinput { width: 600px; }
+ #extract_form textarea { width: 500px; height: 500px; }
+ </style>
+
+ <p>Please specify a pdf or a url or some references to parse</p>
+
+ <form action="" method="post" id="extract_form"
+ enctype="multipart/form-data">
+ <p>PDF: <input type="file" name="pdf" /></p>
+ <p>arXiv: <input type="text" name="arxiv" /></p>
+ <p>URL: <input type="text" name="url" class="urlinput" /></p>
+ <textarea name="txt"></textarea>
+ <p><input type="submit" /></p>
+ </form>
+ """
+
+ def tmpl_web_result(self, references_html):
+ """Template header for extraction page result"""
+ out = """
+ <style type="text/css">
+ #referenceinp_link { display: none; }
+ </style>
+ """
+ return out + references_html
diff --git a/modules/docextract/lib/docextract_webinterface.py b/modules/docextract/lib/docextract_webinterface.py
index 4336ea0f1..57b23d0fb 100644
--- a/modules/docextract/lib/docextract_webinterface.py
+++ b/modules/docextract/lib/docextract_webinterface.py
@@ -1,198 +1,183 @@
- # -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""DocExtract REST and Web API
Exposes document extraction facilities to the world
"""
from tempfile import NamedTemporaryFile
from invenio.webinterface_handler import WebInterfaceDirectory
from invenio.webuser import collect_user_info
from invenio.webpage import page
from invenio.config import CFG_TMPSHAREDDIR, CFG_ETCDIR
from invenio.refextract_api import extract_references_from_file_xml, \
extract_references_from_url_xml, \
extract_references_from_string_xml
from invenio.bibformat_engine import format_record
+import invenio.template
+docextract_templates = invenio.template.load('docextract')
+
def check_login(req):
    """Check that the user is logged in.

    Currently a deliberate no-op for guests: the rejection message is
    kept commented out below in case enforcement is reinstated.
    """
    is_guest = collect_user_info(req)['email'] == 'guest'
    if is_guest:
        # 1. User is guest: must login prior to upload
        # return 'Please login before uploading file.'
        pass
def check_url(url):
    """Check that the url we received is not gibberish

    @param url: (string) the url to validate
    @return: (boolean) True if the url uses a supported scheme
        (http, https or ftp), False otherwise
    """
    # str.startswith accepts a tuple of prefixes: one call, no or-chain
    return url.startswith(('http://', 'https://', 'ftp://'))
def extract_from_pdf_string(pdf):
    """Extract references from a pdf stored in a string

    Given a string representing a pdf, this function writes the string to
    disk and passes it to refextract.
    We need to create a temporary file because we need to run pdf2text on it

    @param pdf: (string) the raw bytes of the pdf document
    @return: (string) the extracted references in marcxml
    """
    # Save new record to file
    tf = NamedTemporaryFile(prefix='docextract-pdf',
                            dir=CFG_TMPSHAREDDIR)

    try:
        tf.write(pdf)
        tf.flush()
        refs = extract_references_from_file_xml(tf.name)
    finally:
        # Also deletes the file
        tf.close()

    return refs
def make_arxiv_url(arxiv_id):
    """Build the url from which the pdf of an arxiv record is downloaded.

    Arguments:
    arxiv_id -- the arxiv id of the record to link to
    """
    pdf_url = "http://arxiv.org/pdf/%s.pdf" % arxiv_id
    return pdf_url
class WebInterfaceAPIDocExtract(WebInterfaceDirectory):
    """DocExtract REST API

    Exposes the extraction helpers at:
    * extract-references-pdf      (uploaded pdf file)
    * extract-references-pdf-url  (url pointing to a pdf)
    * extract-references-txt      (plain text)
    Each handler returns the extracted references as marcxml, or a
    plain error string when the expected form field is missing/invalid.
    """
    _exports = [
        ('extract-references-pdf', 'extract_references_pdf'),
        ('extract-references-pdf-url', 'extract_references_pdf_url'),
        ('extract-references-txt', 'extract_references_txt'),
    ]

    def extract_references_pdf(self, req, form):
        """Extract references from uploaded pdf"""
        check_login(req)

        if 'pdf' not in form:
            return 'No PDF file uploaded'

        return extract_from_pdf_string(form['pdf'].file.read())

    def extract_references_pdf_url(self, req, form):
        """Extract references from the pdf pointed by the passed url"""
        check_login(req)

        if 'url' not in form:
            return 'No URL specified'

        url = form['url'].value
        if not check_url(url):
            return 'Invalid URL specified'

        return extract_references_from_url_xml(url)

    def extract_references_txt(self, req, form):
        """Extract references from plain text"""
        check_login(req)

        if 'txt' not in form:
            return 'No text specified'

        txt = form['txt'].value

        return extract_references_from_string_xml(txt)
class WebInterfaceDocExtract(WebInterfaceDirectory):
"""DocExtract API"""
_exports = ['api',
('', 'extract'),
('example.pdf', 'example_pdf'),
]
api = WebInterfaceAPIDocExtract()
def example_pdf(self, req, _form):
"""Serve a test pdf for tests"""
f = open("%s/docextract/example.pdf" % CFG_ETCDIR, 'rb')
try:
req.write(f.read())
finally:
f.close()
- def extract_template(self):
- """Template for reference extraction page"""
- return """Please specify a pdf or a url or some references to parse
-
- <form action="" method="post"
- enctype="multipart/form-data">
- <p>PDF: <input type="file" name="pdf" /></p>
- <p>arXiv: <input type="text" name="arxiv" /></p>
- <p>URL: <input type="text" name="url" style="width: 600px;"/></p>
- <textarea name="txt" style="width: 500px; height: 500px;"></textarea>
- <p><input type="submit" /></p>
- </form>
- """
-
def extract(self, req, form):
"""Refrences extraction page
This page can be used for authors to test their pdfs against our
refrences extraction process"""
user_info = collect_user_info(req)
# Handle the 3 POST parameters
if 'pdf' in form and form['pdf'].value:
pdf = form['pdf'].value
references_xml = extract_from_pdf_string(pdf)
elif 'arxiv' in form and form['arxiv'].value:
url = make_arxiv_url(arxiv_id=form['arxiv'].value)
references_xml = extract_references_from_url_xml(url)
elif 'url' in form and form['url'].value:
url = form['url'].value
references_xml = extract_references_from_url_xml(url)
elif 'txt' in form and form['txt'].value:
txt = form['txt'].value.decode('utf-8', errors='ignore')
references_xml = extract_references_from_string_xml(txt)
else:
references_xml = None
# If we have not uploaded anything yet
# Display the form that allows us to do so
if not references_xml:
- out = self.extract_template()
+ out = docextract_templates.tmpl_web_form()
else:
- out = """
- <style type="text/css">
- #referenceinp_link { display: none; }
- </style>
- """
- out += format_record(0,
- 'hdref',
- xml_record=references_xml,
- user_info=user_info)
+ references_html = format_record(0,
+ 'hdref',
+ xml_record=references_xml,
+ user_info=user_info)
+ out = docextract_templates.tmpl_web_result(references_html)
# Render the page (including header, footer)
return page(title='References Extractor',
body=out,
uid=user_info['uid'],
req=req)
diff --git a/modules/docextract/lib/refextract_api.py b/modules/docextract/lib/refextract_api.py
index 564fc6ee4..cf709fc46 100644
--- a/modules/docextract/lib/refextract_api.py
+++ b/modules/docextract/lib/refextract_api.py
@@ -1,299 +1,330 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""This is where all the public API calls are accessible
This is the only file containing public calls and everything that is
present here can be considered private by the invenio modules.
"""
import os
from urllib import urlretrieve
from tempfile import mkstemp
from invenio.refextract_engine import parse_references, \
get_plaintext_document_body, \
parse_reference_line, \
get_kbs
from invenio.refextract_text import extract_references_from_fulltext
from invenio.search_engine_utils import get_fieldvalues
from invenio.bibindex_tokenizers.BibIndexJournalTokenizer import \
CFG_JOURNAL_PUBINFO_STANDARD_FORM, \
CFG_JOURNAL_TAG
from invenio.bibdocfile import BibRecDocs, InvenioBibDocFileError
from invenio.search_engine import get_record
from invenio.bibtask import task_low_level_submission
from invenio.bibrecord import record_delete_fields, record_xml_output, \
create_record, record_get_field_instances, record_add_fields, \
record_has_field
from invenio.refextract_find import get_reference_section_beginning, \
find_numeration_in_body
from invenio.refextract_text import rebuild_reference_lines
from invenio.refextract_config import CFG_REFEXTRACT_FILENAME
from invenio.config import CFG_TMPSHAREDDIR
class FullTextNotAvailable(Exception):
    """Raised when we cannot access the document text

    (e.g. no fulltext file is attached to the record, the given path
    does not exist, or the remote url answered 404)."""
class RecordHasReferences(Exception):
    """Raised when
    * we asked to update references for a record
    * we explicitly asked for not overwriting references for this record
      (via the appropriate function argument)
    * the record has references thus we cannot update them
    """
def extract_references_from_url_xml(url):
    """Extract references from the pdf specified in the url

    The single parameter is the url of the pdf.
    It raises FullTextNotAvailable if the url gives a 404
    The result is given in marcxml.
    """
    # Download to a local temporary file: the extraction pipeline needs
    # a real file path to run pdftotext on
    filename, dummy = urlretrieve(url)
    try:
        try:
            marcxml = extract_references_from_file_xml(filename)
        except IOError, err:
            # NOTE(review): a plain IOError has no `code` attribute --
            # this looks like it expects urllib's HTTPError-style errors;
            # other IOErrors would raise AttributeError here. TODO confirm.
            if err.code == 404:
                raise FullTextNotAvailable()
            else:
                raise
    finally:
        # Always clean up the downloaded temporary file
        os.remove(filename)
    return marcxml
def extract_references_from_file_xml(path, recid=None):
    """Extract references from a local pdf file

    The single parameter is the path to the file
    It raises FullTextNotAvailable if the file does not exist
    The result is given in marcxml.
    """
    parsed_record = extract_references_from_file(path=path, recid=recid)
    return parsed_record.to_xml()
def extract_references_from_file(path, recid=None):
    """Extract references from a local pdf file

    @param path: (string) path to the fulltext file
    @param recid: (int) record id to attach to the parsed references,
        if known
    @return: the parsed references as a bibrecord class
    @raise FullTextNotAvailable: if the file does not exist
    """
    if not os.path.isfile(path):
        raise FullTextNotAvailable()

    docbody, dummy = get_plaintext_document_body(path)
    reflines, dummy, dummy = extract_references_from_fulltext(docbody)
    if not reflines:
        # The raw-order conversion found no reference lines: retry with
        # a layout-preserving conversion of the pdf
        docbody, dummy = get_plaintext_document_body(path, keep_layout=True)
        reflines, dummy, dummy = extract_references_from_fulltext(docbody)

    return parse_references(reflines, recid=recid)
def extract_references_from_string_xml(source,
                                       is_only_references=True,
                                       recid=None):
    """Extract references from a string

    Thin wrapper around extract_references_from_string that serializes
    the parsed references to marcxml.
    """
    parsed_record = extract_references_from_string(
        source=source,
        is_only_references=is_only_references,
        recid=recid)
    return parsed_record.to_xml()
def extract_references_from_string(source,
                                   is_only_references=True,
                                   recid=None):
    """Extract references from a string

    Parameters:
    * source: the document text
    * is_only_references: if True, the text is assumed to consist only of
      the reference section; otherwise the reference section is searched for
    * recid: record id, forwarded to the reference parser
    The result is given as a bibrecord class.
    """
    docbody = source.split('\n')
    if not is_only_references:
        reflines, dummy, dummy = extract_references_from_fulltext(docbody)
    else:
        refs_info = get_reference_section_beginning(docbody)
        if not refs_info:
            refs_info, dummy = find_numeration_in_body(docbody)
            refs_info['start_line'] = 0
            # BUGFIX: a stray trailing comma used to turn this value
            # into the 1-tuple (len(docbody) - 1,)
            refs_info['end_line'] = len(docbody) - 1
        reflines = rebuild_reference_lines(docbody, refs_info['marker_pattern'])
    return parse_references(reflines, recid=recid)
def extract_references_from_record(recid):
    """Extract references from a record id

    The single parameter is the record id.
    Raises FullTextNotAvailable when the record has no accessible fulltext.
    The result is given as a bibrecord class.
    """
    fulltext_path = look_for_fulltext(recid)
    if not fulltext_path:
        raise FullTextNotAvailable()
    return extract_references_from_file(fulltext_path, recid=recid)
def extract_references_from_record_xml(recid):
    """Extract references from a record id

    The single parameter is the record id.
    The result is given in marcxml.
    """
    references = extract_references_from_record(recid)
    return references.to_xml()
def replace_references(recid):
    """Replace references for a record

    The record itself is not updated; the marc xml of the document with
    updated references is returned.

    Parameters:
    * recid: the id of the record
    """
    # Parse references
    references = create_record(extract_references_from_record_xml(recid))
    # Record marc xml
    record = get_record(recid)

    if not references[0]:
        return None

    new_fields = record_get_field_instances(references[0],
                                            tag='999',
                                            ind1='%',
                                            ind2='%')
    # Swap all existing 999 fields for the freshly extracted ones
    record_delete_fields(record, '999')
    record_add_fields(record, '999', new_fields)
    # Update record references
    return record_xml_output(record)
def update_references(recid, overwrite=True):
    """Update references for a record

    First, we extract references from a record.
    Then, we are not updating the record directly but adding a bibupload
    task in -c mode which takes care of updating the record.

    Parameters:
    * recid: the id of the record
    * overwrite: when False, refuse to touch a record that already
      carries references
    """
    if not overwrite:
        # Check for references in record
        record = get_record(recid)
        if record and record_has_field(record, '999'):
            raise RecordHasReferences('Record has references and overwrite '
                                      'mode is disabled: %s' % recid)

    # Curated references must never be clobbered
    if get_fieldvalues(recid, '999C59'):
        raise RecordHasReferences('Record has been curated: %s' % recid)

    # Parse references
    references_xml = extract_references_from_record_xml(recid)

    # Save new record to file
    fd, temp_path = mkstemp(prefix=CFG_REFEXTRACT_FILENAME,
                            dir=CFG_TMPSHAREDDIR)
    fh = os.fdopen(fd, 'w')
    fh.write(references_xml)
    fh.close()

    # Update record via the bibupload queue
    task_low_level_submission('bibupload', 'refextract', '-P', '5',
                              '-c', temp_path)
def list_pdfs(recid):
    """Yield every pdf-like file attached to the given record.

    Walks all bibdocs of the record and yields each file whose
    extension is one of pdf, pdfa or PDF.
    """
    for bibdoc in BibRecDocs(recid).list_bibdocs():
        for extension in ('pdf', 'pdfa', 'PDF'):
            try:
                yield bibdoc.get_file(extension)
            except InvenioBibDocFileError:
                # No file with this extension for this doc; keep looking
                pass
def get_pdf_doc(recid):
    """Return the first pdf file attached to the record, or None.

    Parameters:
    * recid: the id of the record
    """
    # next() with a default replaces the Python-2-only iterator.next()
    # call and the explicit StopIteration handling; works on 2.6+ and 3.x
    return next(list_pdfs(recid), None)
def look_for_fulltext(recid):
    """Return the filesystem path of the record's pdf, or None if absent."""
    doc = get_pdf_doc(recid)
    if not doc:
        return None
    return doc.get_full_path()
def record_has_fulltext(recid):
    """Checks if we can access the fulltext for the given recid"""
    return look_for_fulltext(recid) is not None
def search_from_reference(text):
    """Convert a raw reference to a search query

    Called by the search engine to convert a raw reference:
    find rawref John, JINST 4 (1994) 45
    is converted to
    journal:"JINST,4,45"
    """
    field = ''
    pattern = ''
    kbs = get_kbs()
    references, dummy_m, dummy_c, dummy_co = parse_reference_line(text, kbs)

    for elements in references:
        for el in elements:
            if el['type'] == 'JOURNAL':
                # Fill the journal search pattern template with the parsed
                # title/volume/page/year values
                field = 'journal'
                pattern = CFG_JOURNAL_PUBINFO_STANDARD_FORM \
                    .replace(CFG_JOURNAL_TAG.replace('%', 'p'), el['title']) \
                    .replace(CFG_JOURNAL_TAG.replace('%', 'v'), el['volume']) \
                    .replace(CFG_JOURNAL_TAG.replace('%', 'c'), el['page']) \
                    .replace(CFG_JOURNAL_TAG.replace('%', 'y'), el['year'])
                break
            elif el['type'] == 'REPORTNUMBER':
                field = 'report'
                pattern = el['report_num']
                break
    # NOTE(review): the breaks above only exit the inner loop; the outer
    # loop keeps scanning further reference groups, so the LAST matching
    # group wins when several are parsed -- confirm this is intended.
    # The pattern is returned utf-8 encoded (bytes), the field as-is.
    return field, pattern.encode('utf-8')
+
+
def check_record_for_refextract(recid):
    """Decide whether new references may be submitted for a record.

    Returns True when it is safe to extract and submit fresh references,
    False when the existing references must be preserved.
    """
    if get_fieldvalues(recid, '999C6v'):
        # References extracted by refextract.
        # If they have been curated (999C59 present) we must not resubmit:
        # to put in the HP and create ticket in the future.
        # Otherwise they can safely be re-extracted from the new pdf.
        return not get_fieldvalues(recid, '999C59')

    if not get_fieldvalues(recid, '999C5_'):
        # No references in the record, we can safely extract
        # new references
        return True

    # Old record, with either no curated references or references
    # curated by SLAC. We cannot distinguish, so we do nothing
    return False
diff --git a/modules/docextract/lib/refextract_config.py b/modules/docextract/lib/refextract_config.py
index a4dcb4ad4..b32bcbb89 100644
--- a/modules/docextract/lib/refextract_config.py
+++ b/modules/docextract/lib/refextract_config.py
@@ -1,127 +1,127 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""RefExtract configuration"""
from invenio.config import CFG_VERSION, CFG_ETCDIR
# pylint: disable=C0301
# Version of the extraction logic; bumped whenever the extractor changes
# so records can be re-extracted with newer versions
CFG_REFEXTRACT_VERSION_NUM = '1.5.35'
# Version number:
CFG_REFEXTRACT_VERSION = "Invenio/%s refextract/%s" \
    % (CFG_VERSION, CFG_REFEXTRACT_VERSION_NUM)

# Module config directory
CFG_CONF_DIR = '%s/docextract' % CFG_ETCDIR

# Default paths of the knowledge-base files used during extraction
CFG_REFEXTRACT_KBS = {
    'journals'        : "%s/journal-titles.kb" % CFG_CONF_DIR,
    'journals-re'     : "%s/journal-titles-re.kb" % CFG_CONF_DIR,
    'report-numbers'  : "%s/report-numbers.kb" % CFG_CONF_DIR,
    'authors'         : "%s/authors.kb" % CFG_CONF_DIR,
    'collaborations'  : "%s/collaborations.kb" % CFG_CONF_DIR,
    'books'           : "%s/books.kb" % CFG_CONF_DIR,
    'conferences'     : "%s/conferences.kb" % CFG_CONF_DIR,
    'publishers'      : "%s/publishers.kb" % CFG_CONF_DIR,
    'special-journals': "%s/special-journals.kb" % CFG_CONF_DIR,
}

# Prefix for temp files
CFG_REFEXTRACT_FILENAME = "refextract"

## MARC Fields and subfields used by refextract:

# Reference fields: mapping of parsed element type -> 999C5 subfield code
CFG_REFEXTRACT_FIELDS = {
    'misc': 'm',
    'linemarker': 'o',
    'doi': 'a',
    'reportnumber': 'r',
    'journal': 's',
    'url': 'u',
    'urldesc': 'z',
    'author': 'h',
    'title': 't',
    'isbn': 'i',
    'publisher': 'p',
    'year': 'y',
    'collaboration': 'c',
    'recid': '0',
}

CFG_REFEXTRACT_TAG_ID_REFERENCE = "999"  # ref field tag
CFG_REFEXTRACT_IND1_REFERENCE = "C"      # ref field ind1
CFG_REFEXTRACT_IND2_REFERENCE = "5"      # ref field ind2

## refextract statistics fields:
CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS = "999C6"   # ref-stats tag
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS = "a"     # ref-stats subfield
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_TIME = "t"      # ref-stats time subfield
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_VERSION = "v"   # ref-stats version subfield

## Internal tags are used by refextract to mark-up recognised citation
## information.
CFG_REFEXTRACT_MARKER_OPENING_REPORT_NUM = r"<cds.REPORTNUMBER>"
CFG_REFEXTRACT_MARKER_OPENING_TITLE = r"<cds.JOURNAL>"
CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID = r"<cds.JOURNALibid>"
CFG_REFEXTRACT_MARKER_OPENING_SERIES = r"<cds.SER>"
CFG_REFEXTRACT_MARKER_OPENING_VOLUME = r"<cds.VOL>"
CFG_REFEXTRACT_MARKER_OPENING_YEAR = r"<cds.YR>"
CFG_REFEXTRACT_MARKER_OPENING_PAGE = r"<cds.PG>"
CFG_REFEXTRACT_MARKER_OPENING_QUOTED = r"<cds.QUOTED>"
CFG_REFEXTRACT_MARKER_OPENING_ISBN = r"<cds.ISBN>"
CFG_REFEXTRACT_MARKER_OPENING_PUBLISHER = r"<cds.PUBLISHER>"
CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION = r"<cds.COLLABORATION>"

# These are the "closing tags:
CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM = r"</cds.REPORTNUMBER>"
CFG_REFEXTRACT_MARKER_CLOSING_TITLE = r"</cds.JOURNAL>"
CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID = r"</cds.JOURNALibid>"
CFG_REFEXTRACT_MARKER_CLOSING_SERIES = r"</cds.SER>"
CFG_REFEXTRACT_MARKER_CLOSING_VOLUME = r"</cds.VOL>"
CFG_REFEXTRACT_MARKER_CLOSING_YEAR = r"</cds.YR>"
CFG_REFEXTRACT_MARKER_CLOSING_PAGE = r"</cds.PG>"
CFG_REFEXTRACT_MARKER_CLOSING_QUOTED = r"</cds.QUOTED>"
CFG_REFEXTRACT_MARKER_CLOSING_ISBN = r"</cds.ISBN>"
CFG_REFEXTRACT_MARKER_CLOSING_PUBLISHER = r"</cds.PUBLISHER>"
CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION = r"</cds.COLLABORATION>"

## Of the form '</cds.AUTHxxxx>' only
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND = r"</cds.AUTHstnd>"
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL = r"</cds.AUTHetal>"
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL = r"</cds.AUTHincl>"

## The minimum length of a reference's misc text to be deemed insignificant.
## when comparing misc text with semi-colon defined sub-references.
## Values higher than this value reflect meaningful misc text.
## Hence, upon finding a correct semi-colon, but having current misc text
## length less than this value (without other meaningful reference objects:
## report numbers, titles...) then no split will occur.
## (A higher value will increase splitting strictness. i.e. Fewer splits)
## NOTE(review): the "CGF" prefix (sic) on the next two constants is a
## historic typo kept as-is, since external code may import these names.
CGF_REFEXTRACT_SEMI_COLON_MISC_TEXT_SENSITIVITY = 60

## The length of misc text between two adjacent authors which is
## deemed as insignificant. As such, when misc text of a length less
## than this value is found, then the latter author group is dumped into misc.
## (A higher value will increase splitting strictness. i.e. Fewer splits)
CGF_REFEXTRACT_ADJACENT_AUTH_MISC_SEPARATION = 10

## Maximum number of lines for a citation before it is considered invalid
CFG_REFEXTRACT_MAX_LINES = 25
diff --git a/modules/docextract/lib/refextract_kbs.py b/modules/docextract/lib/refextract_kbs.py
index aa4af2338..6b757ad64 100644
--- a/modules/docextract/lib/refextract_kbs.py
+++ b/modules/docextract/lib/refextract_kbs.py
@@ -1,757 +1,787 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
import re
import sys
import csv
-
try:
import hashlib
md5 = hashlib.md5
except ImportError:
from md5 import new as md5
from invenio.refextract_config import CFG_REFEXTRACT_KBS
from invenio.bibknowledge import get_kbr_items
from invenio.config import CFG_REFEXTRACT_KBS_OVERRIDE
from invenio.refextract_re import re_kb_line, \
re_regexp_character_class, \
re_report_num_chars_to_escape, \
re_extract_quoted_text, \
re_extract_char_class, \
re_punctuation
from invenio.docextract_utils import write_message
from invenio.docextract_text import re_group_captured_multiple_space
+from invenio.search_engine import get_collection_reclist
+from invenio.search_engine_utils import get_fieldvalues
def get_kbs(custom_kbs_files=None, cache=None):
    """Load kbs (with caching)

    This function stores the loaded kbs into the cache variable.
    For the caching to work, it needs to receive an empty dictionary
    as "cache" parameter.
    """
    # BUGFIX: test against None rather than falsiness.  A caller-supplied
    # empty dict used to be discarded ("if not cache"), so the documented
    # caching contract (pass an empty dict, have it filled) never worked.
    if cache is None:
        cache = {}
    cache_key = make_cache_key(custom_kbs_files)
    if cache_key not in cache:
        # Build paths from defaults and specified ones
        kbs_files = CFG_REFEXTRACT_KBS.copy()
        for key, path in CFG_REFEXTRACT_KBS_OVERRIDE.items():
            kbs_files[key] = path
        if custom_kbs_files:
            for key, path in custom_kbs_files.items():
                if path:
                    kbs_files[key] = path
        # Loads kbs from those paths
        cache[cache_key] = load_kbs(kbs_files)
    return cache[cache_key]
def load_kbs(kbs_files):
    """Load kbs (without caching)

    Args:
    - kbs_files: mapping of kb name to path, overriding the defaults
    If a path starts with "kb:", the kb will be loaded from the database
    """
    kbs = {}
    kbs['journals_re'] = build_journals_re_kb(kbs_files['journals-re'])
    kbs['journals'] = load_kb(kbs_files['journals'], build_journals_kb)
    kbs['report-numbers'] = build_reportnum_kb(kbs_files['report-numbers'])
    kbs['authors'] = build_authors_kb(kbs_files['authors'])
    kbs['books'] = build_books_kb(kbs_files['books'])
    kbs['publishers'] = load_kb(kbs_files['publishers'], build_publishers_kb)
    kbs['special_journals'] = build_special_journals_kb(kbs_files['special-journals'])
    kbs['collaborations'] = load_kb(kbs_files['collaborations'],
                                    build_collaborations_kb)
    return kbs
def load_kb(path, builder):
    """Dispatch kb loading based on the type/prefix of *path*.

    Anything without a .startswith attribute is treated as an already
    parsed iterable of entries; strings are dispatched on their prefix.
    """
    try:
        path.startswith
    except AttributeError:
        write_message("Loading kb from array", verbose=3)
        return load_kb_from_iterable(path, builder)
    else:
        write_message("Loading kb from %s" % path, verbose=3)
        kb_start = 'kb:'
        records_start = 'records:'
        if path.startswith(kb_start):
            return load_kb_from_db(path[len(kb_start):], builder)
        elif path.startswith(records_start):
            # NOTE(review): this slices off only len('kb:') characters, so
            # the string handed to load_kb_from_records still starts with
            # the tail of the "records:" prefix ("ords:...").  That
            # function discards the first ':'-separated token as "dummy",
            # so the two quirks cancel out -- confirm this is intended
            # before "fixing" either side alone.
            return load_kb_from_records(path[len(kb_start):], builder)
        else:
            return load_kb_from_file(path, builder)
def make_cache_key(custom_kbs_files=None):
    """Create cache key for kbs caches instances

    This function generates a unique key for a given set of arguments.

    The files dictionary is transformed like this:
    {'journal': '/var/journal.kb', 'books': '/var/books.kb'}
    to
    "journal=/var/journal.kb;books=/var/books.kb"
    Then _inspire is appended if we are an INSPIRE site.

    NOTE(review): the code below does NOT append any "_inspire" suffix
    despite the docstring above -- confirm whether the docstring or the
    code is out of date.
    """
    if custom_kbs_files:
        serialized_args = ('%s=%s' % v for v in custom_kbs_files.iteritems())
        serialized_args = ';'.join(serialized_args)
    else:
        serialized_args = "default"
    # md5 keeps the key short and dictionary-friendly; uniqueness, not
    # security, is the goal here
    cache_key = md5(serialized_args).digest()
    return cache_key
def order_reportnum_patterns_bylen(numeration_patterns):
    """Given a list of user-defined patterns for recognising the numeration
    styles of an institute's preprint references, for each pattern,
    strip out character classes and record the length of the pattern.
    Then add the length and the original pattern (in a tuple) into a new
    list for these patterns and return this list.

    @param numeration_patterns: (list) of strings, whereby each string is
     a numeration pattern.
    @return: (list) of tuples, where each tuple contains a pattern and
     its length, ordered longest pattern first.
    """
    pattern_list = []
    for pattern in numeration_patterns:
        # Collapse each character class to a single char so its contents
        # do not inflate the measured length of the pattern
        base_pattern = re_regexp_character_class.sub('1', pattern)
        pattern_list.append((len(base_pattern), pattern))
    # Longest first.  list.sort with a key is stable, matching the
    # behaviour of the previous cmp-based sort (which returned 0 for equal
    # lengths); the cmp-function form of list.sort no longer exists in
    # Python 3.
    pattern_list.sort(key=lambda entry: entry[0], reverse=True)
    return pattern_list
def create_institute_numeration_group_regexp_pattern(patterns):
    """Using a list of regexp patterns for recognising numeration patterns
    for institute preprint references, ordered by length - longest to
    shortest - create a grouped 'OR' or of these patterns, ready to be
    used in a bigger regexp.

    @param patterns: (list) of strings. All of the numeration regexp
     patterns for recognising an institute's preprint reference styles.
    @return: (string) a grouped 'OR' regexp pattern of the numeration
     patterns. E.g.:
      (?P<num>[12]\d{3} \d\d\d|\d\d \d\d\d|[A-Za-z] \d\d\d)
    """
    alternatives = u'|'.join(institute_num_pattern_to_regex(pattern[1])
                             for pattern in patterns)
    return u"(?P<numn>%s)" % alternatives
def institute_num_pattern_to_regex(pattern):
    """Given a numeration pattern from the institutes preprint report
    numbers KB, convert it to turn it into a regexp string for
    recognising such patterns in a reference line.

    Change:
        \        -> \\
        9        -> \d
        a        -> [A-Za-z]
        v        -> [Vv]  # Tony for arXiv vN
        mm       -> (0[1-9]|1[0-2])
        yy       -> \d{2}
        yyyy     -> [12]\d{3}
        /        -> \/
        s        -> \s*

    @param pattern: (string) a user-defined preprint reference numeration
     pattern.
    @return: (string) the regexp for recognising the pattern.
    """
    # NOTE: these substitutions are applied SEQUENTIALLY with str.replace,
    # so their order matters; e.g. the '9' rule fires before '9+', turning
    # "9+" into "\d+" on its own (the '9+' entry is effectively dead but
    # the net result is the same).  Do not reorder without care.
    simple_replacements = [
        ('9', r'\d'),
        ('9+', r'\d+'),
        ('w+', r'\w+'),
        ('a', r'[A-Za-z]'),
        ('v', r'[Vv]'),
        ('mm', r'(0[1-9]|1[0-2])'),
        ('yyyy', r'[12]\d{3}'),
        ('yy', r'\d\d'),
        ('s', r'\s*'),
        (r'/', r'\/')]

    # first, escape certain characters that could be sensitive to a regexp:
    pattern = re_report_num_chars_to_escape.sub(r'\\\g<1>', pattern)

    # now loop through and carry out the simple replacements:
    for repl in simple_replacements:
        pattern = pattern.replace(repl[0], repl[1])

    # now replace a couple of regexp-like paterns:
    # quoted string with non-quoted version ("hello" with hello);
    # Replace / [abcd ]/ with /( [abcd])?/ :
    pattern = re_extract_quoted_text[0].sub(re_extract_quoted_text[1],
                                            pattern)
    pattern = re_extract_char_class[0].sub(re_extract_char_class[1],
                                           pattern)

    # the pattern has been transformed
    return pattern
def build_reportnum_kb(fpath):
    """Given the path to a knowledge base file containing the details
    of institutes and the patterns that their preprint report
    numbering schemes take, create a dictionary of regexp search
    patterns to recognise these preprint references in reference
    lines, and a dictionary of replacements for non-standard preprint
    categories in these references.

    The knowledge base file should consist only of lines that take one
    of the following 3 formats:

    #####Institute Name####

    (the name of the institute to which the preprint reference patterns
    belong, e.g. '#####LANL#####', surrounded by 5 # on either side.)

    <pattern>

    (numeration patterns for an institute's preprints, surrounded by
    < and >.)

    seek-term --- replace-term
    (i.e. a seek phrase on the left hand side, a replace phrase on the
    right hand side, with the two phrases being separated by 3 hyphens.)
    E.g.:
    ASTRO PH ---astro-ph

    The left-hand side term is a non-standard version of the preprint
    reference category; the right-hand side term is the standard version.

    If the KB file cannot be read from, or an unexpected line is
    encountered in the KB, an error message is output to standard error
    and execution is halted with an error-code 0.

    @param fpath: (string) the path to the knowledge base file.
    @return: (tuple) containing 2 dictionaries. The first contains regexp
     search patterns used to identify preprint references in a line. This
     dictionary is keyed by a tuple containing the line number of the
     pattern in the KB and the non-standard category string.
     E.g.: (3, 'ASTRO PH').
     The second dictionary contains the standardised category string,
     and is keyed by the non-standard category string. E.g.: 'astro-ph'.
    """
    def _add_institute_preprint_patterns(preprint_classifications,
                                         preprint_numeration_ptns,
                                         preprint_reference_search_regexp_patterns,
                                         standardised_preprint_reference_categories,
                                         kb_line_num):
        """For a list of preprint category strings and preprint numeration
        patterns for a given institute, create the regexp patterns for
        each of the preprint types.  Add the regexp patterns to the
        dictionary of search patterns
        (preprint_reference_search_regexp_patterns), keyed by the line
        number of the institute in the KB, and the preprint category
        search string.  Also add the standardised preprint category string
        to another dictionary, keyed by the line number of its position
        in the KB and its non-standardised version.
        @param preprint_classifications: (list) of tuples whereby each tuple
         contains a preprint category search string and the line number of
         the name of institute to which it belongs in the KB.
         E.g.: (45, 'ASTRO PH').
        @param preprint_numeration_ptns: (list) of preprint reference
         numeration search patterns (strings)
        @param preprint_reference_search_regexp_patterns: (dictionary) of
         regexp patterns used to search in document lines.
        @param standardised_preprint_reference_categories: (dictionary)
         containing the standardised strings for preprint reference
         categories. (E.g. 'astro-ph'.)
        @param kb_line_num: (integer) - the line number int the KB at
         which a given institute name was found.
        @return: None
        """
        if preprint_classifications and preprint_numeration_ptns:
            # the previous institute had both numeration styles and categories
            # for preprint references.
            # build regexps and add them for this institute:
            # First, order the numeration styles by line-length, and build a
            # grouped regexp for recognising numeration:
            ordered_patterns = \
              order_reportnum_patterns_bylen(preprint_numeration_ptns)
            # create a grouped regexp for numeration part of
            # preprint reference:
            numeration_regexp = \
              create_institute_numeration_group_regexp_pattern(ordered_patterns)

            # for each "classification" part of preprint references, create a
            # complete regex:
            # will be in the style "(categ)-(numatn1|numatn2|numatn3|...)"
            # The optional surrounding [ ( and ) ] let report numbers
            # wrapped in brackets still be recognised.
            for classification in preprint_classifications:
                search_pattern_str = ur'(?:^|[^a-zA-Z0-9\/\.\-])([\[\(]?(?P<categ>' \
                                     + classification[0].strip() + u')' \
                                     + numeration_regexp + u'[\]\)]?)'
                re_search_pattern = re.compile(search_pattern_str,
                                               re.UNICODE)
                preprint_reference_search_regexp_patterns[(kb_line_num,
                                                           classification[0])] = \
                                                           re_search_pattern
                standardised_preprint_reference_categories[(kb_line_num,
                                                            classification[0])] = \
                                                            classification[1]

    preprint_reference_search_regexp_patterns = {}  # a dictionary of patterns
                                                    # used to recognise
                                                    # categories of preprints
                                                    # as used by various
                                                    # institutes
    standardised_preprint_reference_categories = {}  # dictionary of
                                                     # standardised category
                                                     # strings for preprint cats
    current_institute_preprint_classifications = []  # list of tuples containing
                                                     # preprint categories in
                                                     # their raw & standardised
                                                     # forms, as read from KB
    current_institute_numerations = []               # list of preprint
                                                     # numeration patterns, as
                                                     # read from the KB

    # pattern to recognise an institute name line in the KB
    re_institute_name = re.compile(ur'^\*{5}\s*(.+)\s*\*{5}$', re.UNICODE)

    # pattern to recognise an institute preprint categ line in the KB
    re_preprint_classification = \
                re.compile(ur'^\s*(\w.*)\s*---\s*(\w.*)\s*$', re.UNICODE)

    # pattern to recognise a preprint numeration-style line in KB
    re_numeration_pattern = re.compile(ur'^\<(.+)\>$', re.UNICODE)

    kb_line_num = 0    # when making the dictionary of patterns, which is
                       # keyed by the category search string, this counter
                       # will ensure that patterns in the dictionary are not
                       # overwritten if 2 institutes have the same category
                       # styles.

    try:
        if isinstance(fpath, basestring):
            write_message('Loading reports kb from %s' % fpath, verbose=3)
            fh = open(fpath, "r")
            fpath_needs_closing = True
        else:
            fpath_needs_closing = False
            fh = fpath

        for rawline in fh:
            if rawline.startswith('#'):
                continue

            kb_line_num += 1
            try:
                rawline = rawline.decode("utf-8")
            except UnicodeError:
                # NOTE(review): "%e" below formats the int line number in
                # scientific notation; "%s" was probably intended.
                write_message("*** Unicode problems in %s for line %e"
                              % (fpath, kb_line_num), sys.stderr, verbose=0)
                raise UnicodeError("Error: Unable to parse report number kb (line: %s)" % str(kb_line_num))

            m_institute_name = re_institute_name.search(rawline)
            if m_institute_name:
                # This KB line is the name of an institute
                # append the last institute's pattern list to the list of
                # institutes:
                _add_institute_preprint_patterns(current_institute_preprint_classifications,
                                                 current_institute_numerations,
                                                 preprint_reference_search_regexp_patterns,
                                                 standardised_preprint_reference_categories,
                                                 kb_line_num)

                # Now start a new dictionary to contain the search patterns
                # for this institute:
                current_institute_preprint_classifications = []
                current_institute_numerations = []
                # move on to the next line
                continue

            m_preprint_classification = \
                                 re_preprint_classification.search(rawline)
            if m_preprint_classification:
                # This KB line contains a preprint classification for
                # the current institute
                try:
                    current_institute_preprint_classifications.append((m_preprint_classification.group(1),
                                                                       m_preprint_classification.group(2)))
                except (AttributeError, NameError):
                    # didn't match this line correctly - skip it
                    pass
                # move on to the next line
                continue

            m_numeration_pattern = re_numeration_pattern.search(rawline)
            if m_numeration_pattern:
                # This KB line contains a preprint item numeration pattern
                # for the current institute
                try:
                    current_institute_numerations.append(m_numeration_pattern.group(1))
                except (AttributeError, NameError):
                    # didn't match the numeration pattern correctly - skip it
                    pass
                continue

        # Flush the patterns of the final institute in the file
        _add_institute_preprint_patterns(current_institute_preprint_classifications,
                                         current_institute_numerations,
                                         preprint_reference_search_regexp_patterns,
                                         standardised_preprint_reference_categories,
                                         kb_line_num)
        if fpath_needs_closing:
            write_message('Loaded reports kb', verbose=3)
            fh.close()
    except IOError:
        # problem opening KB for reading, or problem while reading from it:
        emsg = """Error: Could not build knowledge base containing """ \
               """institute preprint referencing patterns - failed """ \
               """to read from KB %(kb)s.""" \
               % {'kb' : fpath}
        write_message(emsg, sys.stderr, verbose=0)
        raise IOError("Error: Unable to open report number kb '%s'" % fpath)

    # return the preprint reference patterns and the replacement strings
    # for non-standard categ-strings:
    return (preprint_reference_search_regexp_patterns,
            standardised_preprint_reference_categories)
def _cmp_bystrlen_reverse(a, b):
"""A private "cmp" function to be used by the "sort" function of a
list when ordering the titles found in a knowledge base by string-
length - LONGEST -> SHORTEST.
@param a: (string)
@param b: (string)
@return: (integer) - 0 if len(a) == len(b); 1 if len(a) < len(b);
-1 if len(a) > len(b);
"""
if len(a) > len(b):
return -1
elif len(a) < len(b):
return 1
else:
return 0
def build_special_journals_kb(fpath):
    """Load special journals database from file

    Special journals are journals that have a volume which is not unique
    among different years. To keep the volume unique we are adding the year
    before the volume.
    """
    write_message('Loading special journals kb from %s' % fpath, verbose=3)
    journals = set()
    fh = open(fpath, "r")
    try:
        for line in fh:
            stripped = line.strip()
            # Skip commented and empty lines
            if line.startswith('#') or not stripped:
                continue
            journals.add(stripped)
    finally:
        fh.close()
    write_message('Loaded special journals kb', verbose=3)
    return journals
def build_books_kb(fpath):
    """Load the books kb, keyed by the upper-cased second csv column.

    *fpath* is either a path to a '|'-separated kb file or an already
    parsed iterable of rows.
    """
    if isinstance(fpath, basestring):
        fpath_needs_closing = True
        try:
            write_message('Loading books kb from %s' % fpath, verbose=3)
            fh = open(fpath, "r")
            source = csv.reader(fh, delimiter='|', lineterminator=';')
        except IOError:
            # problem opening KB for reading, or problem while reading from it:
            emsg = "Error: Could not build list of books - failed " \
                   "to read from KB %(kb)s." % {'kb' : fpath}
            raise IOError(emsg)
    else:
        fpath_needs_closing = False
        source = fpath

    books = {}
    try:
        for row in source:
            try:
                books[row[1].upper()] = row
            except IndexError:
                write_message('Invalid line in books kb %s' % row, verbose=1)
    finally:
        if fpath_needs_closing:
            fh.close()

    write_message('Loaded books kb', verbose=3)
    return books
def build_publishers_kb(fpath):
if isinstance(fpath, basestring):
fpath_needs_closing = True
try:
write_message('Loading publishers kb from %s' % fpath, verbose=3)
fh = open(fpath, "r")
source = csv.reader(fh, delimiter='|', lineterminator='\n')
except IOError:
# problem opening KB for reading, or problem while reading from it:
emsg = "Error: Could not build list of publishers - failed " \
"to read from KB %(kb)s." % {'kb' : fpath}
raise IOError(emsg)
else:
fpath_needs_closing = False
source = fpath
try:
publishers = {}
for line in source:
try:
pattern = re.compile(ur'(\b|^)%s(\b|$)' % line[0], re.I|re.U)
publishers[line[0]] = {'pattern': pattern, 'repl': line[1]}
except IndexError:
write_message('Invalid line in books kb %s' % line, verbose=1)
finally:
if fpath_needs_closing:
fh.close()
write_message('Loaded publishers kb', verbose=3)
return publishers
def build_authors_kb(fpath):
    """Load the authors kb as a list of (seek, repl) tuples.

    *fpath* is either a path to a kb file or an already opened
    file-like object.
    """
    if isinstance(fpath, basestring):
        fpath_needs_closing = True
        try:
            fh = open(fpath, "r")
        except IOError:
            # problem opening KB for reading, or problem while reading from it:
            emsg = "Error: Could not build list of authors - failed " \
                   "to read from KB %(kb)s." % {'kb' : fpath}
            write_message(emsg, sys.stderr, verbose=0)
            raise IOError("Error: Unable to open authors kb '%s'" % fpath)
    else:
        fpath_needs_closing = False
        fh = fpath

    replacements = []
    try:
        for rawline in fh:
            # Comment lines are ignored
            if rawline.startswith('#'):
                continue
            # Extract the seek->replace terms from this KB line:
            m_kb_line = re_kb_line.search(rawline.decode('utf-8'))
            if m_kb_line:
                replacements.append((m_kb_line.group('seek'),
                                     m_kb_line.group('repl')))
    finally:
        if fpath_needs_closing:
            fh.close()
    return replacements
def build_journals_re_kb(fpath):
    """Load journals regexps knowledge base

    Returns a list of (regexp, replacement) tuples.
    @see build_journals_kb
    """
    def make_tuple(match):
        regexp = match.group('seek')
        repl = match.group('repl')
        return regexp, repl

    kb = []

    if isinstance(fpath, basestring):
        fpath_needs_closing = True
        try:
            fh = open(fpath, "r")
        except IOError:
            raise IOError("Error: Unable to open journal kb '%s'" % fpath)
    else:
        fpath_needs_closing = False
        fh = fpath

    try:
        for rawline in fh:
            if rawline.startswith('#'):
                continue
            # Extract the seek->replace terms from this KB line:
            m_kb_line = re_kb_line.search(rawline.decode('utf-8'))
            # BUGFIX: a badly formatted line used to crash with
            # AttributeError on the None match; skip it instead, the same
            # way build_authors_kb does
            if m_kb_line:
                kb.append(make_tuple(m_kb_line))
    finally:
        if fpath_needs_closing:
            fh.close()
    return kb
def load_kb_from_iterable(kb, builder):
    """Build a kb directly from an in-memory iterable of entries."""
    result = builder(kb)
    return result
def load_kb_from_file(path, builder):
    """Load a (seek, repl) kb from a text file and feed it to *builder*.

    Each non-comment line must match re_kb_line; a malformed or
    non-utf-8 line aborts the load with StandardError.
    """
    try:
        fh = open(path, "r")
    except IOError, e:
        raise StandardError("Unable to open kb '%s': %s" % (path, e))

    def lazy_parser(fh):
        # Yields (seek, repl) pairs one per well-formed line
        for rawline in fh:
            if rawline.startswith('#'):
                continue

            try:
                rawline = rawline.decode("utf-8").rstrip("\n")
            except UnicodeError:
                raise StandardError("Unicode problems in kb %s at line %s"
                                    % (path, rawline))

            # Test line to ensure that it is a correctly formatted
            # knowledge base line:
            # Extract the seek->replace terms from this KB line
            m_kb_line = re_kb_line.search(rawline)
            if m_kb_line:  # good KB line
                yield m_kb_line.group('seek'), m_kb_line.group('repl')
            else:
                raise StandardError("Badly formatted kb '%s' at line %s"
                                    % (path, rawline))

    try:
        return builder(lazy_parser(fh))
    finally:
        # NOTE(review): the file is closed as soon as builder() returns,
        # so builder must consume the generator eagerly -- confirm all
        # builders do (a lazy builder would read from a closed file).
        fh.close()
def load_kb_from_db(kb_name, builder):
    """Load a kb from the bibknowledge database and feed it to *builder*."""
    def entries(mappings):
        # Each db mapping row becomes a (key, value) pair
        for mapping in mappings:
            yield mapping['key'], mapping['value']

    return builder(entries(get_kbr_items(kb_name)))
def load_kb_from_records(kb_name, builder):
    """Load a kb from the records of a collection.

    *kb_name* is a ':'-separated spec whose last three tokens are
    <collection>:<key tags>:<value tags>, where each tags element is a
    ','-separated list of MARC tags.  Any leading token(s) (remnants of
    the "records:" scheme prefix) are ignored.
    """
    def get_tag_values(recid, tags):
        # All field values of recid for any of the given MARC tags
        for tag in tags:
            for value in get_fieldvalues(recid, tag):
                yield value

    def lazy_parser(collection, left_tags, right_tags):
        for recid in get_collection_reclist(collection):
            # Key tag values
            # e.g. for journals database: 711__a
            left_values = set(get_tag_values(recid, left_tags))
            # Value tag values
            # e.g. for journals database: 130__a, 730__a and 030__a
            # BUGFIX: materialise once per record; the previous generator
            # was exhausted after the first left_value iteration, silently
            # dropping all further (left, right) pairs.
            right_values = set(get_tag_values(recid, right_tags))

            for left_value in left_values:
                for right_value in right_values:
                    yield left_value, right_value

    # Only the last three tokens are meaningful; the caller may pass a
    # partially stripped scheme prefix as a leading token.
    parts = kb_name.split(':')
    collection, left_str, right_str = parts[-3], parts[-2], parts[-1]
    return builder(lazy_parser(collection,
                               left_str.split(','),
                               right_str.split(',')))
+
+
def build_journals_kb(knowledgebase):
    """Given an iterable of journal-title knowledge base entries, read
    the contents into a dictionary of search->replace word phrases.
    The search phrases are compiled into regex pattern objects.
    Each entry is a (seek-term, replace-term) pair, e.g.:
       ASTRONOMY AND ASTROPHYSICS  /  Astron. Astrophys.
    The left-hand side term is a non-standard version of the title,
    whereas the right-hand side term is the standard version.
    @param knowledgebase: (iterable) of (seek-phrase, replace-term)
    pairs, as produced by one of the load_kb_from_* helpers.
    @return: (tuple) of (kb, standardised_titles, seek_phrases):
    - kb: dict mapping each simplified, upper-cased seek phrase to the
      compiled regex pattern used for searching in reference lines;
    - standardised_titles: dict mapping each seek phrase to the
      standardised replacement title;
    - seek_phrases: list of seek phrases, sorted by decreasing string
      length so that searching order matches longest titles first.
    """
    # Initialise vars:
    # dictionary of search and replace phrases from KB:
    kb = {}
    standardised_titles = {}
    seek_phrases = []
    # A dictionary of "replacement terms" (RHS) to be inserted into KB as
    # "seek terms" later, if they were not already explicitly added
    # by the KB:
    repl_terms = {}
    write_message('Processing journals kb', verbose=3)
    for seek_phrase, repl in knowledgebase:
        # We match on a simplified line, thus dots are replaced
        # with spaces
        seek_phrase = seek_phrase.replace('.', ' ').upper()

        # good KB line
        # Add the 'replacement term' into the dictionary of
        # replacement terms:
        repl_terms[repl] = None
        # add the phrase from the KB if the 'seek' phrase is longer
        # compile the seek phrase into a pattern:
        seek_ptn = re.compile(ur'(?<!\w)(%s)\W' % re.escape(seek_phrase),
                              re.UNICODE)
        kb[seek_phrase] = seek_ptn
        standardised_titles[seek_phrase] = repl
        seek_phrases.append(seek_phrase)
    # Now, for every 'replacement term' found in the KB, if it is
    # not already in the KB as a "search term", add it:
    for repl_term in repl_terms.keys():
        raw_repl_phrase = repl_term.upper()
        # Simplify the replacement term the same way seek terms are
        # simplified: strip punctuation and collapse whitespace.
        raw_repl_phrase = re_punctuation.sub(u' ', raw_repl_phrase)
        raw_repl_phrase = \
            re_group_captured_multiple_space.sub(u' ', raw_repl_phrase)
        raw_repl_phrase = raw_repl_phrase.strip()
        if raw_repl_phrase not in kb:
            # The replace-phrase was not in the KB as a seek phrase
            # It should be added.
            pattern = ur'(?<!\/)\b(%s)[^A-Z0-9]' % re.escape(raw_repl_phrase)
            seek_ptn = re.compile(pattern, re.U)
            kb[raw_repl_phrase] = seek_ptn
            standardised_titles[raw_repl_phrase] = repl_term
            seek_phrases.append(raw_repl_phrase)
    # Sort the titles by string length (long - short)
    seek_phrases.sort(_cmp_bystrlen_reverse)
    write_message('Processed journals kb', verbose=3)
    # return the raw knowledge base:
    return kb, standardised_titles, seek_phrases
def build_collaborations_kb(knowledgebase):
    """Build a dictionary mapping collaboration names to compiled regexes.

    Each KB pattern is wrapped so that:
    - it may be preceded by an opening bracket/quote/whitespace and an
      optional "the"/"and";
    - the literal word "Collaboration" in the pattern also matches the
      variants "coll.", "collaboration" or "collaborations", or nothing;
    - it must be followed by a closing delimiter, whitespace or end of line.

    @param knowledgebase: iterable of (pattern, collaboration) pairs;
        the pattern is a regex fragment, not a literal string.
    @return: dict mapping collaboration name -> compiled regex
    """
    # Raw strings replace the former ur'' literals (a syntax error on
    # Python 3) and the invalid '\s' escape; byte content is identical.
    # These are loop invariants, so build them once.
    prefix = r"(?:^|[\(\"\[\s]|(?<=\W))\s*(?:(?:the|and)\s+)?"
    collaboration_pattern = r"(?:\s*coll(?:aborations?|\.)?)?"
    suffix = r"(?=$|[><\]\)\"\s.,:])"
    kb = {}
    for pattern, collab in knowledgebase:
        # Whitespace inside a KB pattern should match any whitespace run
        pattern = pattern.replace(' ', r'\s')
        # Accept common abbreviations/variants of "Collaboration"
        pattern = pattern.replace('Collaboration', collaboration_pattern)
        re_pattern = "%s(%s)%s" % (prefix, pattern, suffix)
        kb[collab] = re.compile(re_pattern, re.I | re.U)
    return kb
diff --git a/modules/docextract/lib/refextract_record.py b/modules/docextract/lib/refextract_record.py
index 5ec5daebc..c3f3bbadc 100644
--- a/modules/docextract/lib/refextract_record.py
+++ b/modules/docextract/lib/refextract_record.py
@@ -1,257 +1,240 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2013 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
from datetime import datetime
from invenio.docextract_record import BibRecord, \
BibRecordField
from invenio.refextract_config import \
CFG_REFEXTRACT_FIELDS, \
CFG_REFEXTRACT_IND1_REFERENCE, \
CFG_REFEXTRACT_IND2_REFERENCE, \
CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS, \
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS, \
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_TIME, \
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_VERSION, \
CFG_REFEXTRACT_VERSION
from invenio import config
CFG_INSPIRE_SITE = getattr(config, 'CFG_INSPIRE_SITE', False)
def format_marker(line_marker):
    """Strip surrounding punctuation and spaces from a reference line
    marker, e.g. "[19]." -> "19"."""
    unwanted = "[](){}. "
    return line_marker.strip(unwanted)
def build_record(counts, fields, recid=None, status_code=0):
    """Build a MARC record holding the extracted reference fields plus a
    statistics datafield describing the extraction job.
    The record will essentially take the following
    structure:
     <record>
        <controlfield tag="001">1</controlfield>
        <datafield tag="999" ind1="C" ind2="5">
           [...]
        </datafield>
        [...]
        <datafield tag="999" ind1="C" ind2="6">
           <subfield code="a">
        Invenio/X.XX.X refextract/X.XX.X-timestamp-err-repnum-title-URL-misc
           </subfield>
        </datafield>
     </record>
    Timestamp, error(code), reportnum, title, URL, and misc of course
    take the relevant values.
    @param counts: (dict) - extraction statistics, with keys 'reportnum',
    'title', 'auth_group', 'url', 'doi' and 'misc' giving the number of
    each citation type found in the document's reference lines.
    @param fields: (list) - the 999 reference fields making up the
    document body.
    @param recid: (string) - the record-id of the given document. (put into
    001 field.)
    @param status_code: (integer) the status of reference-extraction for the
    given record: was there an error or not? 0 = no error; 1 = error.
    @return: (BibRecord) the record with the references and the appended
    extraction-statistics datafield.
    """
    record = BibRecord(recid=recid)
    record['999'] = fields
    field = record.add_field(CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS)
    # Statistics string: status-reportnum-title-author-url-doi-misc
    stats_str = "%(status)s-%(reportnum)s-%(title)s-%(author)s-%(url)s-%(doi)s-%(misc)s" % {
        'status' : status_code,
        'reportnum' : counts['reportnum'],
        'title' : counts['title'],
        'author' : counts['auth_group'],
        'url' : counts['url'],
        'doi' : counts['doi'],
        'misc' : counts['misc'],
    }
    field.add_subfield(CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS,
                       stats_str)
    # Record when and with which refextract version the job ran
    field.add_subfield(CFG_REFEXTRACT_SUBFIELD_EXTRACTION_TIME,
                       datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    field.add_subfield(CFG_REFEXTRACT_SUBFIELD_EXTRACTION_VERSION,
                       CFG_REFEXTRACT_VERSION)
    return record
def build_references(citations):
    """Build MARC reference fields from a list of parsed citations.

    Each citation dictionary carries a 'line_marker' and a list of
    'elements' groups; every group is turned into one or more reference
    fields via build_reference_fields(). Author information may cause a
    single citation line to be split into SEPARATE references, and all
    groups from one line share the same marker value.
    """
    fields = []
    for citation in citations:
        marker = citation['line_marker']
        for element_group in citation['elements']:
            fields.extend(build_reference_fields(element_group, marker))
    return fields
def add_subfield(field, code, value):
    """Append *value* to *field*, translating the logical subfield name
    (e.g. 'journal') into its MARC code via CFG_REFEXTRACT_FIELDS."""
    marc_code = CFG_REFEXTRACT_FIELDS[code]
    return field.add_subfield(marc_code, value)
def add_journal_subfield(field, element, inspire_format):
    """Add the journal subfield, formatted either the INSPIRE way
    (title,volume,page) or the default way (title volume (year) page)."""
    if inspire_format:
        template = '%(title)s,%(volume)s,%(page)s'
    else:
        template = '%(title)s %(volume)s (%(year)s) %(page)s'
    return add_subfield(field, 'journal', template % element)
def create_reference_field(line_marker):
    """Create an empty reference datafield, seeding it with the cleaned
    line marker when the marker is not just punctuation."""
    field = BibRecordField(ind1=CFG_REFEXTRACT_IND1_REFERENCE,
                           ind2=CFG_REFEXTRACT_IND2_REFERENCE)
    marker_has_content = bool(line_marker.strip("., [](){}"))
    if marker_has_content:
        add_subfield(field, 'linemarker', format_marker(line_marker))
    return field
def build_reference_fields(citation_elements, line_marker, inspire_format=None):
    """ Create the MARC-XML string of the found reference information which
        was taken from a tagged reference line.
        @param citation_elements: (list) an ordered list of dictionary elements,
                                  with each element corresponding to a found
                                  piece of information from a reference line.
        @param line_marker: (string) The line marker for this single reference
                            line (e.g. [19])
        @param inspire_format: (bool or None) whether journal subfields are
                               formatted the INSPIRE way; defaults to the
                               CFG_INSPIRE_SITE setting when None.
        @return xml_line: (string) The MARC-XML representation of the list of
                          reference elements
    """
    if inspire_format is None:
        inspire_format = CFG_INSPIRE_SITE

    ## Begin the datafield element
    current_field = create_reference_field(line_marker)
    reference_fields = [current_field]

    for element in citation_elements:
        ## Before going onto checking 'what' the next element is, handle misc text and semi-colons
        ## Multiple misc text subfields will be compressed later
        ## This will also be the only part of the code that deals with MISC tag_typed elements
        misc_txt = element['misc_txt']
        if misc_txt.strip("., [](){}"):
            misc_txt = misc_txt.lstrip('])} ,.').rstrip('[({ ,.')
            add_subfield(current_field, 'misc', misc_txt)

        # Now handle the type dependent actions
        # JOURNAL
        if element['type'] == "JOURNAL":
            add_journal_subfield(current_field, element, inspire_format)
        # REPORT NUMBER
        elif element['type'] == "REPORTNUMBER":
            add_subfield(current_field, 'reportnumber', element['report_num'])
        # URL
        elif element['type'] == "URL":
            if element['url_string'] == element['url_desc']:
                # Build the datafield for the URL segment of the reference line:
                add_subfield(current_field, 'url', element['url_string'])
            # Else, in the case that the url string and the description differ in some way, include them both
            else:
                add_subfield(current_field, 'url', element['url_string'])
                add_subfield(current_field, 'urldesc', element['url_desc'])
        # DOI
        elif element['type'] == "DOI":
            add_subfield(current_field, 'doi', element['doi_string'])
        # AUTHOR
        elif element['type'] == "AUTH":
            value = element['auth_txt']
            if element['auth_type'] == 'incl':
                # "Included" author groups are wrapped in parentheses
                value = "(%s)" % value
            add_subfield(current_field, 'author', value)
        elif element['type'] == "QUOTED":
            add_subfield(current_field, 'title', element['title'])
        elif element['type'] == "ISBN":
            add_subfield(current_field, 'isbn', element['ISBN'])
        elif element['type'] == "BOOK":
            add_subfield(current_field, 'title', element['title'])
        elif element['type'] == "PUBLISHER":
            add_subfield(current_field, 'publisher', element['publisher'])
        elif element['type'] == "YEAR":
            add_subfield(current_field, 'year', element['year'])
        elif element['type'] == "COLLABORATION":
            add_subfield(current_field,
                         'collaboration',
                         element['collaboration'])
        elif element['type'] == "RECID":
            add_subfield(current_field, 'recid', str(element['recid']))

    # Compress adjacent misc subfields into a single one per field
    for field in reference_fields:
        merge_misc(field)

    return reference_fields
def merge_misc(field):
    """Collapse all 'm' (misc) subfields of *field* into the first one,
    joining their values with a single space and removing the extras.
    Non-misc subfields are left untouched, in their original order."""
    merged = None
    # Iterate over a copy: we remove entries from field.subfields below
    for sub in list(field.subfields):
        if sub.code != 'm':
            continue
        if merged is None:
            # First misc subfield becomes the accumulator
            merged = sub
        else:
            merged.value += " " + sub.value
            field.subfields.remove(sub)
diff --git a/modules/docextract/lib/refextract_regression_tests.py b/modules/docextract/lib/refextract_regression_tests.py
index be2d7eede..a5f2b6b79 100644
--- a/modules/docextract/lib/refextract_regression_tests.py
+++ b/modules/docextract/lib/refextract_regression_tests.py
@@ -1,2816 +1,2853 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2010, 2011, 2013 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
The Refextract regression tests suite
The tests will not modify the database.
They are intended to make sure there is no regression in references parsing.
"""
from invenio.testutils import InvenioTestCase
import re
from invenio.testutils import make_test_suite, run_test_suite, InvenioXmlTestCase
from invenio.refextract_engine import parse_references
from invenio.docextract_utils import setup_loggers
from invenio.refextract_text import wash_and_repair_reference_line
from invenio import refextract_kbs
from invenio import refextract_record
def compare_references(test, record, expected_references, ignore_misc=True):
    """Assert that the references of *record* match *expected_references*.

    Keeps only the 999C5 reference fields (dropping the extraction
    statistics datafield) and, when ignore_misc is set, strips the 'm'
    (misc) subfields before comparing the XML serializations.
    """
    # Remove the statistical datafield from the final extracted references
    record['999'] = record.find_fields('999C5')
    if ignore_misc:
        # We don't care about what's in the misc field
        for field in record['999']:
            kept = []
            for subfield in field.subfields:
                if subfield.code != 'm':
                    kept.append(subfield)
            field.subfields = kept
    test.assertXmlEqual(record.to_xml(), expected_references.encode('utf-8'))
def _reference_test(test, ref_line, parsed_reference, ignore_misc=True):
    """Wash a single reference line, parse it with the test's knowledge
    bases and compare the resulting record against the expected XML."""
    washed_line = wash_and_repair_reference_line(ref_line)
    kbs = {
        'journals': test.kb_journals,
        'journals-re': test.kb_journals_re,
        'report-numbers': test.kb_report_numbers,
        'books': test.kb_books,
    }
    record = parse_references([washed_line], kbs_files=kbs)
    compare_references(test, record, parsed_reference,
                       ignore_misc=ignore_misc)
class RefextractInvenioTest(InvenioXmlTestCase):
    """Regression tests for refextract in plain Invenio mode.

    CFG_INSPIRE_SITE is forced off and the KB overrides are cleared,
    so reference lines are parsed with the default Invenio rules.
    """

    def setUp(self):
        # Save and override the global configuration so the tests are
        # deterministic; restored in tearDown.
        self.old_override = refextract_kbs.CFG_REFEXTRACT_KBS_OVERRIDE
        refextract_kbs.CFG_REFEXTRACT_KBS_OVERRIDE = {}
        self.old_inspire = refextract_record.CFG_INSPIRE_SITE
        refextract_record.CFG_INSPIRE_SITE = False
        setup_loggers(verbosity=0)
        self.maxDiff = 2000
        # No custom knowledge bases: fall back to the installed defaults
        self.kb_journals = None
        self.kb_journals_re = None
        self.kb_report_numbers = None
        self.kb_authors = None
        self.kb_books = None
        self.kb_conferences = None

    def tearDown(self):
        # Restore the global configuration saved in setUp
        refextract_kbs.CFG_REFEXTRACT_KBS_OVERRIDE = self.old_override
        refextract_record.CFG_INSPIRE_SITE = self.old_inspire

    def test_month_with_year(self):
        # The month inside the numeration should be dropped, keeping the year
        ref_line = u"""[2] S. Weinberg, A Model of Leptons, Phys. Rev. Lett. 19 (Nov, 1967) 1264–1266."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">2</subfield>
<subfield code="h">S. Weinberg, A Model of Leptons</subfield>
<subfield code="s">Phys. Rev. Lett. 19 (1967) 1264</subfield>
<subfield code="y">1967</subfield>
</datafield>
</record>""")

    def test_numeration_not_finding_year(self):
        # Year given at the end after "vol." and "pp." must still be found
        ref_line = u"""[137] M. Papakyriacou, H. Mayer, C. Pypen, H. P. Jr., and S. Stanzl-Tschegg, “Influence of loading frequency on high cycle fatigue properties of b.c.c. and h.c.p. metals,” Materials Science and Engineering, vol. A308, pp. 143–152, 2001."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">137</subfield>
<subfield code="h">M. Papakyriacou, H. Mayer, C. Pypen, H. P. Jr., and S. Stanzl-Tschegg</subfield>
<subfield code="t">Influence of loading frequency on high cycle fatigue properties of b.c.c. and h.c.p. metals</subfield>
<subfield code="s">Mat.Sci.Eng. A308 (2001) 143</subfield>
<subfield code="y">2001</subfield>
</datafield>
</record>""")

    def test_numeration_not_finding_year2(self):
        """Bug fix test for numeration not finding year in this citation"""
        ref_line = u"""[138] Y.-B. Park, R. Mnig, and C. A. Volkert, “Frequency effect on thermal fatigue damage in Cu interconnects,” Thin Solid Films, vol. 515, pp. 3253– 3258, 2007."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">138</subfield>
<subfield code="h">Y.-B. Park, R. Mnig, and C. A. Volkert</subfield>
<subfield code="t">Frequency effect on thermal fatigue damage in Cu interconnects</subfield>
<subfield code="s">Thin Solid Films 515 (2007) 3253</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>""")

    def test_extra_a_in_report_number(self):
        # Several report numbers and collaborations in one line must all
        # be extracted as separate subfields of the same reference
        ref_line = u'[14] CMS Collaboration, CMS-PAS-HIG-12-002. CMS Collaboration, CMS-PAS-HIG-12-008. CMS Collaboration, CMS-PAS-HIG-12-022. ATLAS Collaboration, arXiv:1205.0701. ATLAS Collaboration, ATLAS-CONF-2012-078.'
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="c">CMS Collaboration</subfield>
<subfield code="r">CMS-PAS-HIG-12-002</subfield>
<subfield code="c">CMS Collaboration</subfield>
<subfield code="r">CMS-PAS-HIG-12-008</subfield>
<subfield code="c">CMS Collaboration</subfield>
<subfield code="r">CMS-PAS-HIG-12-022</subfield>
<subfield code="c">ATLAS Collaboration</subfield>
<subfield code="r">arXiv:1205.0701</subfield>
<subfield code="c">ATLAS Collaboration</subfield>
<subfield code="r">ATLAS-CONF-2012-078</subfield>
</datafield>
</record>""")
class RefextractTest(InvenioXmlTestCase):
"""Testing output of refextract"""
def setUp(self):
self.old_inspire = refextract_record.CFG_INSPIRE_SITE
refextract_record.CFG_INSPIRE_SITE = True
self.inspire = True
self.kb_books = [
('Griffiths, David', 'Introduction to elementary particles', '2008')
]
self.kb_journals = [
("PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS", "Phys.Rev.ST Accel.Beams"),
("PHYS REV D", "Phys.Rev.;D"),
("PHYS REV", "Phys.Rev."),
("PHYS REV LETT", "Phys.Rev.Lett."),
("PHYS LETT", "Phys.Lett."),
("J PHYS", "J.Phys."),
("JOURNAL OF PHYSICS", "J.Phys."),
("J PHYS G", "J.Phys.;G"),
("PHYSICAL REVIEW", "Phys.Rev."),
("ADV THEO MATH PHYS", "Adv.Theor.Math.Phys."),
("MATH PHYS", "Math.Phys."),
("J MATH PHYS", "J.Math.Phys."),
("JHEP", "JHEP"),
("SITZUNGSBER PREUSS AKAD WISS PHYS MATH KL", "Sitzungsber.Preuss.Akad.Wiss.Berlin (Math.Phys.)"),
("PHYS LETT", "Phys.Lett."),
("NUCL PHYS", "Nucl.Phys."),
("NUCL PHYS", "Nucl.Phys."),
("NUCL PHYS PROC SUPPL", "Nucl.Phys.Proc.Suppl."),
("JINST", "JINST"),
("THE EUROPEAN PHYSICAL JOURNAL C PARTICLES AND FIELDS", "Eur.Phys.J.;C"),
("COMMUN MATH PHYS", "Commun.Math.Phys."),
("COMM MATH PHYS", "Commun.Math.Phys."),
("REV MOD PHYS", "Rev.Mod.Phys."),
("ANN PHYS U S", "Ann.Phys."),
("AM J PHYS", "Am.J.Phys."),
("PROC R SOC LONDON SER", "Proc.Roy.Soc.Lond."),
("CLASS QUANT GRAVITY", "Class.Quant.Grav."),
("FOUND PHYS", "Found.Phys."),
("IEEE TRANS NUCL SCI", "IEEE Trans.Nucl.Sci."),
("SCIENCE", "Science"),
("ACTA MATERIALIA", "Acta Mater."),
("REVIEWS OF MODERN PHYSICS", "Rev.Mod.Phys."),
("NUCL INSTRUM METHODS", "Nucl.Instrum.Meth."),
("Z PHYS", "Z.Phys."),
("Eur. Phys. J.", "Eur.Phys.J."),
]
self.kb_journals_re = [
"DAN---Dokl.Akad.Nauk Ser.Fiz.",
]
self.kb_report_numbers = [
"#####CERN#####",
"< yy 999>",
"< yyyy 999>",
"ATL PHYS INT---ATL-PHYS-INT",
"#####LHC#####",
"< yy 999>",
"<syyyy 999>",
"< 999>",
"< 9999>",
"CERN LHC PROJECT REPORT---CERN-LHC-Project-Report",
"CLIC NOTE ---CERN-CLIC-Note",
"CERN LHCC ---CERN-LHCC",
"CERN EP ---CERN-EP",
"######ATLANTIS#######",
"< 9999999>",
"CERN EX---CERN-EX",
]
setup_loggers(verbosity=0)
self.maxDiff = 2500
def tearDown(self):
refextract_record.CFG_INSPIRE_SITE = self.old_inspire
def test_year_title_volume_page(self):
ref_line = u"[14] L. Randall and R. Sundrum, (1999) Phys. Rev. Lett. B83 S08004 More text"
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="h">L. Randall and R. Sundrum</subfield>
<subfield code="s">Phys.Rev.Lett.,B83,S08004</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>""")
def test_url1(self):
ref_line = u"""[1] <a href="http://cdsweb.cern.ch/">CERN Document Server</a> J. Maldacena, Adv. Theor. Math. Phys. 2 (1998) 231, hep-th/9711200; http://cdsweb.cern.ch/ then http://www.itp.ucsb.edu/online/susyc99/discussion/. ; L. Susskind, J. Math. Phys. 36 (1995) 6377, hep-th/9409089; hello world a<a href="http://uk.yahoo.com/">Yahoo!</a>. Fin."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="u">http://cdsweb.cern.ch/</subfield>
<subfield code="z">CERN Document Server</subfield>
<subfield code="h">J. Maldacena</subfield>
<subfield code="s">Adv.Theor.Math.Phys.,2,231</subfield>
<subfield code="r">hep-th/9711200</subfield>
<subfield code="y">1998</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="u">http://cdsweb.cern.ch/</subfield>
<subfield code="u">http://www.itp.ucsb.edu/online/susyc99/discussion/</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">L. Susskind</subfield>
<subfield code="s">J.Math.Phys.,36,6377</subfield>
<subfield code="r">hep-th/9409089</subfield>
<subfield code="y">1995</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="u">http://uk.yahoo.com/</subfield>
<subfield code="z">Yahoo!</subfield>
</datafield>
</record>""")
def test_url2(self):
ref_line = u"""[2] J. Maldacena, Adv. Theor. Math. Phys. 2 (1998) 231; hep-th/9711200. http://cdsweb.cern.ch/"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">2</subfield>
<subfield code="h">J. Maldacena</subfield>
<subfield code="s">Adv.Theor.Math.Phys.,2,231</subfield>
<subfield code="y">1998</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">2</subfield>
<subfield code="r">hep-th/9711200</subfield>
<subfield code="u">http://cdsweb.cern.ch/</subfield>
</datafield>
</record>""")
def test_url3(self):
ref_line = u"3. “pUML Initial Submission to OMG’ s RFP for UML 2.0 Infrastructure”. URL http://www.cs.york.ac.uk/puml/"
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">3</subfield>
<subfield code="t">pUML Initial Submission to OMG\u2019 s RFP for UML 2.0 Infrastructure</subfield>
<subfield code="u">http://www.cs.york.ac.uk/puml/</subfield>
</datafield>
</record>""")
def test_url4(self):
ref_line = u"""[3] S. Gubser, I. Klebanov and A. Polyakov, Phys. Lett. B428 (1998) 105; hep-th/9802109. http://cdsweb.cern.ch/search.py?AGE=hello-world&ln=en"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">3</subfield>
<subfield code="h">S. Gubser, I. Klebanov and A. Polyakov</subfield>
<subfield code="s">Phys.Lett.,B428,105</subfield>
<subfield code="y">1998</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">3</subfield>
<subfield code="r">hep-th/9802109</subfield>
<subfield code="u">http://cdsweb.cern.ch/search.py?AGE=hello-world&amp;ln=en</subfield>
</datafield>
</record>""")
def test_url5(self):
ref_line = u"""[9] H. J. Drescher and Y. Nara, Phys. Rev. C 75, 034905 (2007); MC-KLN 3.46 at http://www.aiu.ac.jp/ynara/mckln/."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">9</subfield>
<subfield code="h">H. J. Drescher and Y. Nara</subfield>
<subfield code="s">Phys.Rev.,C75,034905</subfield>
<subfield code="y">2007</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">9</subfield>
<subfield code="u">http://www.aiu.ac.jp/ynara/mckln/</subfield>
</datafield>
</record>""")
def test_hep(self):
ref_line = u"""[5] O. Aharony, S. Gubser, J. Maldacena, H. Ooguri and Y. Oz, hep-th/9905111."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">5</subfield>
<subfield code="h">O. Aharony, S. Gubser, J. Maldacena, H. Ooguri and Y. Oz</subfield>
<subfield code="r">hep-th/9905111</subfield>
</datafield>
</record>""")
def test_hep2(self):
ref_line = u"""[4] E. Witten, Adv. Theor. Math. Phys. 2 (1998) 253; hep-th/9802150."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="h">E. Witten</subfield>
<subfield code="s">Adv.Theor.Math.Phys.,2,253</subfield>
<subfield code="y">1998</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="r">hep-th/9802150</subfield>
</datafield>
</record>""")
def test_hep3(self):
ref_line = u"""[6] L. Susskind, J. Math. Phys. 36 (1995) 6377; hep-th/9409089."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">L. Susskind</subfield>
<subfield code="s">J.Math.Phys.,36,6377</subfield>
<subfield code="y">1995</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="r">hep-th/9409089</subfield>
</datafield>
</record>""")
def test_hep4(self):
ref_line = u"""[7] L. Susskind and E. Witten, hep-th/9805114."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">7</subfield>
<subfield code="h">L. Susskind and E. Witten</subfield>
<subfield code="r">hep-th/9805114</subfield>
</datafield>
</record>""")
def test_double_hep_no_semi_colon(self):
ref_line = u"""[7] W. Fischler and L. Susskind, hep-th/9806039; N. Kaloper and A. Linde, Phys. Rev. D60 (1999) 105509, hep-th/9904120."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">7</subfield>
<subfield code="h">W. Fischler and L. Susskind</subfield>
<subfield code="r">hep-th/9806039</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">7</subfield>
<subfield code="h">N. Kaloper and A. Linde</subfield>
<subfield code="s">Phys.Rev.,D60,105509</subfield>
<subfield code="r">hep-th/9904120</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>""")
def test_journal_colon_sep(self):
ref_line = u"""[9] R. Bousso, JHEP 9906:028 (1999); hep-th/9906022."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">9</subfield>
<subfield code="h">R. Bousso</subfield>
<subfield code="s">JHEP,9906,028</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">9</subfield>
<subfield code="r">hep-th/9906022</subfield>
</datafield>
</record>""")
def test_book1(self):
"""book with authors and title but no quotes"""
ref_line = u"""[10] R. Penrose and W. Rindler, Spinors and Spacetime, volume 2, chapter 9 (Cambridge University Press, Cambridge, 1986)."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">10</subfield>
<subfield code="h">R. Penrose and W. Rindler</subfield>
</datafield>
</record>""")
def test_hep_combined(self):
ref_line = u"""[11] R. Britto-Pacumio, A. Strominger and A. Volovich, JHEP 9911:013 (1999); hep-th/9905210; blah hep-th/9905211; blah hep-ph/9711200"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">11</subfield>
<subfield code="h">R. Britto-Pacumio, A. Strominger and A. Volovich</subfield>
<subfield code="s">JHEP,9911,013</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">11</subfield>
<subfield code="r">hep-th/9905210</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">11</subfield>
<subfield code="r">hep-th/9905211</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">11</subfield>
<subfield code="r">hep-ph/9711200</subfield>
</datafield>
</record>""")
def test_misc5(self):
ref_line = u"""[12] V. Balasubramanian and P. Kraus, Commun. Math. Phys. 208 (1999) 413; hep-th/9902121."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="h">V. Balasubramanian and P. Kraus</subfield>
<subfield code="s">Commun.Math.Phys.,208,413</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="r">hep-th/9902121</subfield>
</datafield>
</record>""")
    def test_misc6(self):
        """Journal reference with a trailing semicolon-separated arXiv id."""
        ref_line = u"""[13] V. Balasubramanian and P. Kraus, Phys. Rev. Lett. 83 (1999) 3605; hep-th/9903190."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">13</subfield>
<subfield code="h">V. Balasubramanian and P. Kraus</subfield>
<subfield code="s">Phys.Rev.Lett.,83,3605</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">13</subfield>
<subfield code="r">hep-th/9903190</subfield>
</datafield>
</record>""")
    def test_hep5(self):
        """Authors followed by a bare arXiv report number."""
        ref_line = u"""[14] P. Kraus, F. Larsen and R. Siebelink, hep-th/9906127."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="h">P. Kraus, F. Larsen and R. Siebelink</subfield>
<subfield code="r">hep-th/9906127</subfield>
</datafield>
</record>""")
    def test_report1(self):
        """arXiv id and an institutional report number found in the same line."""
        ref_line = u"""[15] L. Randall and R. Sundrum, Phys. Rev. Lett. 83 (1999) 4690; hep-th/9906064. this is a test RN of a different type: CERN-LHC-Project-Report-2006. more text."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">L. Randall and R. Sundrum</subfield>
<subfield code="s">Phys.Rev.Lett.,83,4690</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="r">hep-th/9906064</subfield>
<subfield code="r">CERN-LHC-Project-Report-2006</subfield>
</datafield>
</record>""")
    def test_hep6(self):
        """Single author with an arXiv report number."""
        ref_line = u"""[16] S. Gubser, hep-th/9912001."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">16</subfield>
<subfield code="h">S. Gubser</subfield>
<subfield code="r">hep-th/9912001</subfield>
</datafield>
</record>""")
    def test_triple_hep(self):
        """Three semicolon-separated author+arXiv references in one line."""
        ref_line = u"""[17] H. Verlinde, hep-th/9906182; H. Verlinde, hep-th/9912018; J. de Boer, E. Verlinde and H. Verlinde, hep-th/9912012."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">17</subfield>
<subfield code="h">H. Verlinde</subfield>
<subfield code="r">hep-th/9906182</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">17</subfield>
<subfield code="h">H. Verlinde</subfield>
<subfield code="r">hep-th/9912018</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">17</subfield>
<subfield code="h">J. de Boer, E. Verlinde and H. Verlinde</subfield>
<subfield code="r">hep-th/9912012</subfield>
</datafield>
</record>""")
    def test_url_no_tag(self):
        """Quoted title and a bare URL with no surrounding markup tag."""
        ref_line = u"""[18] E. Witten, remarks at ITP Santa Barbara conference, "New dimensions in field theory and string theory": http://www.itp.ucsb.edu/online/susyc99/discussion/."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">18</subfield>
<subfield code="h">E. Witten</subfield>
<subfield code="t">New dimensions in field theory and string theory</subfield>
<subfield code="u">http://www.itp.ucsb.edu/online/susyc99/discussion/</subfield>
</datafield>
</record>""")
    def test_journal_simple(self):
        """Plain journal reference: authors, title, volume, year, page."""
        ref_line = u"""[19] D. Page and C. Pope, Commun. Math. Phys. 127 (1990) 529."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">19</subfield>
<subfield code="h">D. Page and C. Pope</subfield>
<subfield code="s">Commun.Math.Phys.,127,529</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
    def test_unknown_report(self):
        """Journal title unknown to the system: only the authors are kept."""
        ref_line = u"""[20] M. Duff, B. Nilsson and C. Pope, Physics Reports 130 (1986), chapter 9."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">20</subfield>
<subfield code="h">M. Duff, B. Nilsson and C. Pope</subfield>
</datafield>
</record>""")
    def test_journal_volume_with_letter(self):
        """Series letter attached to the volume (B79) is kept with it."""
        ref_line = u"""[21] D. Page, Phys. Lett. B79 (1978) 235."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">21</subfield>
<subfield code="h">D. Page</subfield>
<subfield code="s">Phys.Lett.,B79,235</subfield>
<subfield code="y">1978</subfield>
</datafield>
</record>""")
    def test_journal_with_hep1(self):
        """Two journal references; the first also carries an arXiv id."""
        ref_line = u"""[22] M. Cassidy and S. Hawking, Phys. Rev. D57 (1998) 2372, hep-th/9709066; S. Hawking, Phys. Rev. D52 (1995) 5681."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">22</subfield>
<subfield code="h">M. Cassidy and S. Hawking</subfield>
<subfield code="s">Phys.Rev.,D57,2372</subfield>
<subfield code="r">hep-th/9709066</subfield>
<subfield code="y">1998</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">22</subfield>
<subfield code="h">S. Hawking</subfield>
<subfield code="s">Phys.Rev.,D52,5681</subfield>
<subfield code="y">1995</subfield>
</datafield>
</record>""")
    def test_hep7(self):
        """Two authors with an arXiv report number."""
        ref_line = u"""[23] K. Skenderis and S. Solodukhin, hep-th/9910023."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">23</subfield>
<subfield code="h">K. Skenderis and S. Solodukhin</subfield>
<subfield code="r">hep-th/9910023</subfield>
</datafield>
</record>""")
    def test_journal_with_hep2(self):
        """JHEP volume:page style normalized, with an arXiv id."""
        ref_line = u"""[24] M. Henningson and K. Skenderis, JHEP 9807:023 (1998), hep-th/9806087."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">24</subfield>
<subfield code="h">M. Henningson and K. Skenderis</subfield>
<subfield code="s">JHEP,9807,023</subfield>
<subfield code="r">hep-th/9806087</subfield>
<subfield code="y">1998</subfield>
</datafield>
</record>""")
    def test_unknown_book(self):
        """Quoted book title with no recognized journal: title + authors only."""
        ref_line = u"""[25] C. Fefferman and C. Graham, "Conformal Invariants", in Elie Cartan et les Mathematiques d'aujourd'hui (Asterisque, 1985) 95."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">25</subfield>
<subfield code="h">C. Fefferman and C. Graham</subfield>
<subfield code="t">Conformal Invariants</subfield>
</datafield>
</record>""")
    def test_hep8(self):
        """Hyphenated initials (S.-T.) with an arXiv report number."""
        ref_line = u"""[27] E. Witten and S.-T. Yau, hep-th/9910245."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">27</subfield>
<subfield code="h">E. Witten and S.-T. Yau</subfield>
<subfield code="r">hep-th/9910245</subfield>
</datafield>
</record>""")
    def test_hep9(self):
        """JHEP reference followed by a semicolon-separated arXiv id."""
        ref_line = u"""[28] R. Emparan, JHEP 9906:036 (1999); hep-th/9906040."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">28</subfield>
<subfield code="h">R. Emparan</subfield>
<subfield code="s">JHEP,9906,036</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">28</subfield>
<subfield code="r">hep-th/9906040</subfield>
</datafield>
</record>""")
    def test_journal_with_hep3(self):
        """Two journal+arXiv references separated by a semicolon."""
        ref_line = u"""[29] A. Chamblin, R. Emparan, C. Johnson and R. Myers, Phys. Rev. D59 (1999) 64010, hep-th/9808177; S. Hawking, C. Hunter and D. Page, Phys. Rev. D59 (1998) 44033, hep-th/9809035."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">29</subfield>
<subfield code="h">A. Chamblin, R. Emparan, C. Johnson and R. Myers</subfield>
<subfield code="s">Phys.Rev.,D59,64010</subfield>
<subfield code="r">hep-th/9808177</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">29</subfield>
<subfield code="h">S. Hawking, C. Hunter and D. Page</subfield>
<subfield code="s">Phys.Rev.,D59,44033</subfield>
<subfield code="r">hep-th/9809035</subfield>
<subfield code="y">1998</subfield>
</datafield>
</record>""")
    def test_journal_with_hep4(self):
        """Two journal+arXiv references from different journals in one line."""
        ref_line = u"""[30] S. Sethi and L. Susskind, Phys. Lett. B400 (1997) 265, hep-th/9702101; T. Banks and N. Seiberg, Nucl. Phys. B497 (1997) 41, hep-th/9702187."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">30</subfield>
<subfield code="h">S. Sethi and L. Susskind</subfield>
<subfield code="s">Phys.Lett.,B400,265</subfield>
<subfield code="r">hep-th/9702101</subfield>
<subfield code="y">1997</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">30</subfield>
<subfield code="h">T. Banks and N. Seiberg</subfield>
<subfield code="s">Nucl.Phys.,B497,41</subfield>
<subfield code="r">hep-th/9702187</subfield>
<subfield code="y">1997</subfield>
</datafield>
</record>""")
    def test_misc7(self):
        """arXiv id after a semicolon becomes its own datafield."""
        ref_line = u"""[31] R. Emparan, C. Johnson and R. Myers, Phys. Rev. D60 (1999) 104001; hep-th/9903238."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">31</subfield>
<subfield code="h">R. Emparan, C. Johnson and R. Myers</subfield>
<subfield code="s">Phys.Rev.,D60,104001</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">31</subfield>
<subfield code="r">hep-th/9903238</subfield>
</datafield>
</record>""")
    def test_misc8(self):
        """Zero-padded page number (064005) preserved in the journal subfield."""
        ref_line = u"""[32] S. Hawking, C. Hunter and M. Taylor-Robinson, Phys. Rev. D59 (1999) 064005; hep-th/9811056."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">32</subfield>
<subfield code="h">S. Hawking, C. Hunter and M. Taylor-Robinson</subfield>
<subfield code="s">Phys.Rev.,D59,064005</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">32</subfield>
<subfield code="r">hep-th/9811056</subfield>
</datafield>
</record>""")
    def test_misc9(self):
        """Journal reference plus a semicolon-separated arXiv id."""
        ref_line = u"""[33] J. Dowker, Class. Quant. Grav. 16 (1999) 1937; hep-th/9812202."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">33</subfield>
<subfield code="h">J. Dowker</subfield>
<subfield code="s">Class.Quant.Grav.,16,1937</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">33</subfield>
<subfield code="r">hep-th/9812202</subfield>
</datafield>
</record>""")
    def test_journal3(self):
        """Simple journal reference with a series letter in the volume."""
        ref_line = u"""[34] J. Brown and J. York, Phys. Rev. D47 (1993) 1407."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">34</subfield>
<subfield code="h">J. Brown and J. York</subfield>
<subfield code="s">Phys.Rev.,D47,1407</subfield>
<subfield code="y">1993</subfield>
</datafield>
</record>""")
    def test_misc10(self):
        """Trailing 'IBID A ...' inherits the journal title with the new series letter."""
        ref_line = u"""[35] D. Freedman, S. Mathur, A. Matsuis and L. Rastelli, Nucl. Phys. B546 (1999) 96; hep-th/9804058. More text, followed by an IBID A 546 (1999) 96"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">35</subfield>
<subfield code="h">D. Freedman, S. Mathur, A. Matsuis and L. Rastelli</subfield>
<subfield code="s">Nucl.Phys.,B546,96</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">35</subfield>
<subfield code="r">hep-th/9804058</subfield>
<subfield code="h">D. Freedman, S. Mathur, A. Matsuis and L. Rastelli</subfield>
<subfield code="s">Nucl.Phys.,A546,96</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>""")
    def test_misc11(self):
        """Incomplete trailing 'IBID A' (no volume/page) produces no journal subfield."""
        ref_line = u"""[36] D. Freedman, S. Mathur, A. Matsuis and L. Rastelli, Nucl. Phys. B546 (1999) 96; hep-th/9804058. More text, followed by an IBID A"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">36</subfield>
<subfield code="h">D. Freedman, S. Mathur, A. Matsuis and L. Rastelli</subfield>
<subfield code="s">Nucl.Phys.,B546,96</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">36</subfield>
<subfield code="r">hep-th/9804058</subfield>
</datafield>
</record>""")
    def test_misc12(self):
        """Report numbers known to the system additionally get a record-id subfield ($0)."""
        ref_line = u"""[37] some misc lkjslkdjlksjflksj [hep-th/0703265] lkjlkjlkjlkj [hep-th/0606096], hep-ph/0002060, some more misc; Nucl. Phys. B546 (1999) 96"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="r">hep-th/0703265</subfield>
<subfield code="0">93</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="r">hep-th/0606096</subfield>
<subfield code="0">92</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="r">hep-ph/0002060</subfield>
<subfield code="0">96</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="s">Nucl.Phys.,B546,96</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>""")
    def test_misc13(self):
        """Known arXiv ids embedded in punctuation-heavy misc text still get $0 recids."""
        ref_line = u"""[38] R. Emparan, C. Johnson and R.. Myers, Phys. Rev. D60 (1999) 104001; this is :: .... misc! hep-th/0703265. and some ...,.,.,.,::: more hep-th/0606096"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">38</subfield>
<subfield code="h">R. Emparan, C. Johnson and R.. Myers</subfield>
<subfield code="s">Phys.Rev.,D60,104001</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">38</subfield>
<subfield code="r">hep-th/0703265</subfield>
<subfield code="0">93</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">38</subfield>
<subfield code="r">hep-th/0606096</subfield>
<subfield code="0">92</subfield>
</datafield>
</record>""")
    def test_misc14(self):
        """Same as test_misc12 but with report numbers unknown to the system."""
        ref_line = u"""[37] some misc lkjslkdjlksjflksj [hep-th/9206059] lkjlkjlkjlkj [hep-th/9206060], hep-ph/9206061, some more misc; Nucl. Phys. B546 (1999) 96"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="r">hep-th/9206059</subfield>
<subfield code="r">hep-th/9206060</subfield>
<subfield code="r">hep-ph/9206061</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="s">Nucl.Phys.,B546,96</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>""")
    def test_misc15(self):
        """Same as test_misc13 but with report numbers unknown to the system."""
        ref_line = u"""[38] R. Emparan, C. Johnson and R.. Myers, Phys. Rev. D60 (1999) 104001; this is :: .... misc! hep-th/9206059. and some ...,.,.,.,::: more hep-th/9206060"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">38</subfield>
<subfield code="h">R. Emparan, C. Johnson and R.. Myers</subfield>
<subfield code="s">Phys.Rev.,D60,104001</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">38</subfield>
<subfield code="r">hep-th/9206059</subfield>
<subfield code="r">hep-th/9206060</subfield>
</datafield>
</record>""")
    def test_journal_with_hep5(self):
        """JHEP 'month(year)' style normalized to a 4-digit volume, plus arXiv id."""
        ref_line = u"""[39] A. Ceresole, G. Dall Agata and R. D Auria, JHEP 11(1999) 009, [hep-th/9907216]."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">A. Ceresole, G. Dall Agata and R. D Auria</subfield>
<subfield code="s">JHEP,9911,009</subfield>
<subfield code="r">hep-th/9907216</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>""")
    def test_journal_with_hep6(self):
        """'volume, page (year)' ordering plus a bracketed arXiv id."""
        ref_line = u"""[40] D.P. Jatkar and S. Randjbar-Daemi, Phys. Lett. B460, 281 (1999) [hep-th/9904187]."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">40</subfield>
<subfield code="h">D.P. Jatkar and S. Randjbar-Daemi</subfield>
<subfield code="s">Phys.Lett.,B460,281</subfield>
<subfield code="r">hep-th/9904187</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>""")
    def test_journal_with_hep7(self):
        """'volume, (year) page' ordering plus a bracketed arXiv id."""
        ref_line = u"""[41] G. DallAgata, Phys. Lett. B460, (1999) 79, [hep-th/9904198]."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">41</subfield>
<subfield code="h">G. DallAgata</subfield>
<subfield code="s">Phys.Lett.,B460,79</subfield>
<subfield code="r">hep-th/9904198</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>""")
    def test_journal_year_volume_page(self):
        """Year given before volume and page ('1988, 120, 121')."""
        ref_line = u"""[43] Becchi C., Blasi A., Bonneau G., Collina R., Delduc F., Commun. Math. Phys., 1988, 120, 121."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">43</subfield>
<subfield code="h">Becchi C., Blasi A., Bonneau G., Collina R., Delduc F.</subfield>
<subfield code="s">Commun.Math.Phys.,120,121</subfield>
<subfield code="y">1988</subfield>
</datafield>
</record>""")
    def test_journal_volume_year_page1(self):
        """'volume, (year), page-range' ordering; journal name normalized, page range truncated to first page."""
        ref_line = u"""[44]: N. Nekrasov, A. Schwarz, Instantons on noncommutative R4 and (2, 0) superconformal six-dimensional theory, Comm. Math. Phys., 198, (1998), 689-703."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">44</subfield>
<subfield code="h">N. Nekrasov, A. Schwarz</subfield>
<subfield code="s">Commun.Math.Phys.,198,689</subfield>
<subfield code="y">1998</subfield>
</datafield>
</record>""")
    def test_journal_volume_year_page2(self):
        """'volume, (year), page-range' ordering with a title between authors and journal."""
        ref_line = u"""[42] S.M. Donaldson, Instantons and Geometric Invariant Theory, Comm. Math. Phys., 93, (1984), 453-460."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">42</subfield>
<subfield code="h">S.M. Donaldson</subfield>
<subfield code="s">Commun.Math.Phys.,93,453</subfield>
<subfield code="y">1984</subfield>
</datafield>
</record>""")
    def test_many_references_in_one_line(self):
        """Eight semicolon-separated references, including ibid forms that reuse the preceding authors."""
        ref_line = u"""[45] H. J. Bhabha, Rev. Mod. Phys. 17, 200(1945); ibid, 21, 451(1949); S. Weinberg, Phys. Rev. 133, B1318(1964); ibid, 134, 882(1964); D. L. Pursey, Ann. Phys(U. S)32, 157(1965); W. K. Tung, Phys, Rev. Lett. 16, 763(1966); Phys. Rev. 156, 1385(1967); W. J. Hurley, Phys. Rev. Lett. 29, 1475(1972)."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">H. J. Bhabha</subfield>
<subfield code="s">Rev.Mod.Phys.,17,200</subfield>
<subfield code="y">1945</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">H. J. Bhabha</subfield>
<subfield code="s">Rev.Mod.Phys.,21,451</subfield>
<subfield code="y">1949</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">S. Weinberg</subfield>
<subfield code="s">Phys.Rev.,133,B1318</subfield>
<subfield code="y">1964</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">S. Weinberg</subfield>
<subfield code="s">Phys.Rev.,134,882</subfield>
<subfield code="y">1964</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">D. L. Pursey</subfield>
<subfield code="s">Ann.Phys.,32,157</subfield>
<subfield code="y">1965</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">W. K. Tung</subfield>
<subfield code="s">Phys.Rev.Lett.,16,763</subfield>
<subfield code="y">1966</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="s">Phys.Rev.,156,1385</subfield>
<subfield code="y">1967</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">W. J. Hurley</subfield>
<subfield code="s">Phys.Rev.Lett.,29,1475</subfield>
<subfield code="y">1972</subfield>
</datafield>
</record>""")
    def test_ibid(self):
        """Simple ibid test: ibid reuses the previous journal title and authors."""
        ref_line = u"""[46] E. Schrodinger, Sitzungsber. Preuss. Akad. Wiss. Phys. Math. Kl. 24, 418(1930); ibid, 3, 1(1931)"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">Sitzungsber.Preuss.Akad.Wiss.Berlin (Math.Phys.),24,418</subfield>
<subfield code="y">1930</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">Sitzungsber.Preuss.Akad.Wiss.Berlin (Math.Phys.),3,1</subfield>
<subfield code="y">1931</subfield>
</datafield>
</record>""")
    def test_ibid2(self):
        """Series has to be recognized for ibid to work properly."""
        ref_line = u"""[46] E. Schrodinger, J.Phys. G 24, 418 (1930); ibid, 3, 1(1931)"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">J.Phys.,G24,418</subfield>
<subfield code="y">1930</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">J.Phys.,G3,1</subfield>
<subfield code="y">1931</subfield>
</datafield>
</record>""")
    def test_ibid3(self):
        """Series after volume has to be recognized for ibid to work properly."""
        ref_line = u"""[46] E. Schrodinger, J.Phys. G 24, 418 (1930); ibid, 3, 1(1931)"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">J.Phys.,G24,418</subfield>
<subfield code="y">1930</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">J.Phys.,G3,1</subfield>
<subfield code="y">1931</subfield>
</datafield>
</record>""")
    def test_ibid4(self):
        """An explicit series letter in the ibid overrides the inherited one."""
        ref_line = u"""[46] E. Schrodinger, J.Phys. G 24, 418 (1930); ibid, A 3, 1(1931)"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">J.Phys.,G24,418</subfield>
<subfield code="y">1930</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">J.Phys.,A3,1</subfield>
<subfield code="y">1931</subfield>
</datafield>
</record>""")
    def test_invalid_ibid(self):
        """Ibid with no preceding journal must fall back to misc text (no journal subfield)."""
        ref_line = u"""[46] E. Schrodinger, ibid, 3, 1(1931)"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
</datafield>
</record>""")
    def test_misc4(self):
        """ibid inherits the normalized journal title, taking the new series letter/volume."""
        ref_line = u"""[47] P. A. M. Dirac, Proc. R. Soc. London, Ser. A155, 447(1936); ibid, D24, 3333(1981)."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">47</subfield>
<subfield code="h">P. A. M. Dirac</subfield>
<subfield code="s">Proc.Roy.Soc.Lond.,A155,447</subfield>
<subfield code="y">1936</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">47</subfield>
<subfield code="h">P. A. M. Dirac</subfield>
<subfield code="s">Proc.Roy.Soc.Lond.,D24,3333</subfield>
<subfield code="y">1981</subfield>
</datafield>
</record>""")
    def test_doi(self):
        """'doi:' prefix extracted into subfield $a, alongside an arXiv id."""
        ref_line = u"""[48] O.O. Vaneeva, R.O. Popovych and C. Sophocleous, Enhanced Group Analysis and Exact Solutions of Vari-able Coefficient Semilinear Diffusion Equations with a Power Source, Acta Appl. Math., doi:10.1007/s10440-008-9280-9, 46 p., arXiv:0708.3457."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">48</subfield>
<subfield code="h">O.O. Vaneeva, R.O. Popovych and C. Sophocleous</subfield>
<subfield code="a">10.1007/s10440-008-9280-9</subfield>
<subfield code="r">arXiv:0708.3457</subfield>
</datafield>
</record>""")
    def test_doi2(self):
        """dx.doi.org URL form; angle brackets in the DOI are XML-escaped."""
        ref_line = u"""[1] http://dx.doi.org/10.1175/1520-0442(2000)013<2671:TAORTT>2.0.CO;2"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="a">10.1175/1520-0442(2000)013&lt;2671:TAORTT&gt;2.0.CO;2</subfield>
</datafield>
</record>""")
    def test_misc3(self):
        """DOI given as a dx.doi.org URL at the end of the reference."""
        ref_line = u"""[49] M. I. Trofimov, N. De Filippis and E. A. Smolenskii. Application of the electronegativity indices of organic molecules to tasks of chemical informatics. Russ. Chem. Bull., 54:2235-2246, 2005. http://dx.doi.org/10.1007/s11172-006-0105-6."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">49</subfield>
<subfield code="h">M. I. Trofimov, N. De Filippis and E. A. Smolenskii</subfield>
<subfield code="a">10.1007/s11172-006-0105-6</subfield>
</datafield>
</record>""")
    def test_misc2(self):
        """Three semicolon-separated references; a publisher ($p) is extracted for the first."""
        ref_line = u"""[50] M. Gell-Mann, P. Ramon ans R. Slansky, in Supergravity, P. van Niewenhuizen and D. Freedman (North-Holland 1979); T. Yanagida, in Proceedings of the Workshop on the Unified Thoery and the Baryon Number in teh Universe, ed. O. Sawaga and A. Sugamoto (Tsukuba 1979); R.N. Mohapatra and G. Senjanovic, Phys. Rev. Lett. 44, 912, (1980)."""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">50</subfield>
<subfield code="h">M. Gell-Mann, P. Ramon ans R. Slansky</subfield>
<subfield code="p">North-Holland</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">50</subfield>
<subfield code="h">T. Yanagida</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">50</subfield>
<subfield code="h">R.N. Mohapatra and G. Senjanovic</subfield>
<subfield code="s">Phys.Rev.Lett.,44,912</subfield>
<subfield code="y">1980</subfield>
</datafield>
</record>""")
    def test_misc1(self):
        """Editors marked with 'eds.' get wrapped in parentheses; a trailing journal with no authors stands alone."""
        ref_line = u"""[51] L.S. Durkin and P. Langacker, Phys. Lett B166, 436 (1986); Amaldi et al., Phys. Rev. D36, 1385 (1987); Hayward and Yellow et al., eds. Phys. Lett B245, 669 (1990); Nucl. Phys. B342, 15 (1990);"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">51</subfield>
<subfield code="h">L.S. Durkin and P. Langacker</subfield>
<subfield code="s">Phys.Lett.,B166,436</subfield>
<subfield code="y">1986</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">51</subfield>
<subfield code="h">Amaldi et al.</subfield>
<subfield code="s">Phys.Rev.,D36,1385</subfield>
<subfield code="y">1987</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">51</subfield>
<subfield code="h">(Hayward and Yellow et al. (eds.))</subfield>
<subfield code="s">Phys.Lett.,B245,669</subfield>
<subfield code="y">1990</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">51</subfield>
<subfield code="s">Nucl.Phys.,B342,15</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
    def test_combination_of_authors_names(self):
        """Authors' names in varied formats (surname-first, initials without spaces)."""
        ref_line = u"""[53] Hush, D.R., R.Leighton, and B.G. Horne, 1993. "Progress in supervised Neural Netw. What's new since Lippmann?" IEEE Signal Process. Magazine 10, 8-39"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">53</subfield>
<subfield code="h">Hush, D.R., R.Leighton, and B.G. Horne</subfield>
<subfield code="t">Progress in supervised Neural Netw. What's new since Lippmann?</subfield>
<subfield code="p">IEEE</subfield>
</datafield>
</record>""")
    def test_two_initials_no_space(self):
        """Two dotted initials with no separating space (T.G.)."""
        ref_line = u"""[54] T.G. Rizzo, Phys. Rev. D40, 3035 (1989)"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">54</subfield>
<subfield code="h">T.G. Rizzo</subfield>
<subfield code="s">Phys.Rev.,D40,3035</subfield>
<subfield code="y">1989</subfield>
</datafield>
</record>""")
    def test_surname_prefix_van(self):
        """An author with prefix + surname
        e.g. van Niewenhuizen"""
        ref_line = u"""[55] Hawking S., P. van Niewenhuizen, L.S. Durkin, D. Freeman, some title of some journal"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">55</subfield>
<subfield code="h">Hawking S., P. van Niewenhuizen, L.S. Durkin, D. Freeman</subfield>
</datafield>
</record>""")
    def test_authors_coma_but_no_journal(self):
        """2 authors separated by comma"""
        ref_line = u"""[56] Hawking S., D. Freeman, some title of some journal"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">56</subfield>
<subfield code="h">Hawking S., D. Freeman</subfield>
</datafield>
</record>""")
    def test_authors_and_but_no_journal(self):
        """2 authors separated by "and" """
        ref_line = u"""[57] Hawking S. and D. Freeman, another random title of some random journal"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">57</subfield>
<subfield code="h">Hawking S. and D. Freeman</subfield>
</datafield>
</record>""")
    def test_simple_et_al(self):
        """author ending with et al."""
        ref_line = u"""[1] Amaldi et al., Phys. Rev. D36, 1385 (1987)"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">Amaldi et al.</subfield>
<subfield code="s">Phys.Rev.,D36,1385</subfield>
<subfield code="y">1987</subfield>
</datafield>
</record>""")
    def test_ibid_two_journals(self):
        """IBIDEM test
        ibidem must copy the previous reference journal and not
        the first one
        """
        ref_line = u"""[58] Nucl. Phys. B342, 15 (1990); Phys. Lett. B261, 146 (1991); ibidem B263, 459 (1991);"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">58</subfield>
<subfield code="s">Nucl.Phys.,B342,15</subfield>
<subfield code="y">1990</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">58</subfield>
<subfield code="s">Phys.Lett.,B261,146</subfield>
<subfield code="y">1991</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">58</subfield>
<subfield code="s">Phys.Lett.,B263,459</subfield>
<subfield code="y">1991</subfield>
</datafield>
</record>""")
    def test_collaboration(self):
        """Collaboration name extracted into its own subfield ($c)."""
        ref_line = u"""[60] HERMES Collaboration, Airapetian A et al. 2005 Phys. Rev. D 71 012003 1-36"""
        _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">60</subfield>
<subfield code="c">HERMES Collaboration</subfield>
<subfield code="h">Airapetian A et al.</subfield>
<subfield code="s">Phys.Rev.,D71,012003</subfield>
<subfield code="y">2005</subfield>
</datafield>
</record>""")
def test_weird_number_after_volume(self):
ref_line = u"""[61] de Florian D, Sassot R and Stratmann M 2007 Phys. Rev. D 75 114010 1-26"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">61</subfield>
<subfield code="h">de Florian D, Sassot R and Stratmann M</subfield>
<subfield code="s">Phys.Rev.,D75,114010</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>""")
def test_year_before_journal(self):
ref_line = u"""[64] Bourrely C, Soffer J and Buccella F 2002 Eur. Phys. J. C 23 487-501"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">64</subfield>
<subfield code="h">Bourrely C, Soffer J and Buccella F</subfield>
<subfield code="s">Eur.Phys.J.,C23,487</subfield>
<subfield code="y">2002</subfield>
</datafield>
</record>""")
def test_non_recognized_reference(self):
ref_line = u"""[63] Z. Guzik and R. Jacobsson, LHCb Readout Supervisor ’ODIN’ with a L1\nTrigger - Technical reference, Aug 2005, EDMS 704078-V1.0"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">63</subfield>
<subfield code="h">Z. Guzik and R. Jacobsson</subfield>
</datafield>
</record>""")
def test_year_stuck_to_volume(self):
ref_line = u"""[65] K. Huang, Am. J. Phys. 20, 479(1952)"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">65</subfield>
<subfield code="h">K. Huang</subfield>
<subfield code="s">Am.J.Phys.,20,479</subfield>
<subfield code="y">1952</subfield>
</datafield>
</record>""")
def test_two_initials_after_surname(self):
"""Author with 2 initials
e.g. Pate S. F."""
ref_line = u"""[62] Pate S. F., McKee D. W. and Papavassiliou V. 2008 Phys.Rev. C 78 448"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">62</subfield>
<subfield code="h">Pate S. F., McKee D. W. and Papavassiliou V.</subfield>
<subfield code="s">Phys.Rev.,C78,448</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_one_initial_after_surname(self):
"""Author with 1 initials
e.g. Pate S."""
ref_line = u"""[62] Pate S., McKee D., 2008 Phys.Rev. C 78 448"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">62</subfield>
<subfield code="h">Pate S., McKee D.</subfield>
<subfield code="s">Phys.Rev.,C78,448</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_two_initials_no_dot_after_surname(self):
    """Author with 2 undotted initials after the surname,
    e.g. Pate S F"""
    ref_line = u"""[62] Pate S F, McKee D W and Papavassiliou V 2008 Phys.Rev. C 78 448"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">62</subfield>
<subfield code="h">Pate S F, McKee D W and Papavassiliou V</subfield>
<subfield code="s">Phys.Rev.,C78,448</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_one_initial_no_dot_after_surname(self):
    """Author with 1 undotted initial after the surname,
    e.g. Pate S"""
    ref_line = u"""[62] Pate S, McKee D, 2008 Phys.Rev. C 78 448"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">62</subfield>
<subfield code="h">Pate S, McKee D</subfield>
<subfield code="s">Phys.Rev.,C78,448</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_two_initials_before_surname(self):
    """Author with 2 dotted initials before the surname, e.g. G. A. Perkins."""
    ref_line = u"""[67] G. A. Perkins, Found. Phys. 6, 237(1976)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">67</subfield>
<subfield code="h">G. A. Perkins</subfield>
<subfield code="s">Found.Phys.,6,237</subfield>
<subfield code="y">1976</subfield>
</datafield>
</record>""")
def test_one_initial_before_surname(self):
    """Author with 1 dotted initial before the surname, e.g. G. Perkins."""
    ref_line = u"""[67] G. Perkins, Found. Phys. 6, 237(1976)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">67</subfield>
<subfield code="h">G. Perkins</subfield>
<subfield code="s">Found.Phys.,6,237</subfield>
<subfield code="y">1976</subfield>
</datafield>
</record>""")
def test_two_initials_no_dot_before_surname(self):
    """Author with 2 undotted initials before the surname, e.g. G A Perkins."""
    ref_line = u"""[67] G A Perkins, Found. Phys. 6, 237(1976)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">67</subfield>
<subfield code="h">G A Perkins</subfield>
<subfield code="s">Found.Phys.,6,237</subfield>
<subfield code="y">1976</subfield>
</datafield>
</record>""")
def test_one_initial_no_dot_before_surname(self):
    """Author with 1 undotted initial before the surname, e.g. G Perkins."""
    ref_line = u"""[67] G Perkins, Found. Phys. 6, 237(1976)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">67</subfield>
<subfield code="h">G Perkins</subfield>
<subfield code="s">Found.Phys.,6,237</subfield>
<subfield code="y">1976</subfield>
</datafield>
</record>""")
def test_ibid_twice(self):
    """Two successive 'ibid' citations inherit the authors and journal title."""
    ref_line = u"""[68] A. O. Barut et al, Phys. Rev. D23, 2454(1981); ibid, D24, 3333(1981); ibid, D31, 1386(1985)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">68</subfield>
<subfield code="h">A. O. Barut et al.</subfield>
<subfield code="s">Phys.Rev.,D23,2454</subfield>
<subfield code="y">1981</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">68</subfield>
<subfield code="h">A. O. Barut et al.</subfield>
<subfield code="s">Phys.Rev.,D24,3333</subfield>
<subfield code="y">1981</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">68</subfield>
<subfield code="h">A. O. Barut et al.</subfield>
<subfield code="s">Phys.Rev.,D31,1386</subfield>
<subfield code="y">1985</subfield>
</datafield>
</record>""")
def test_no_authors(self):
    """Citation with no author list at all: journal + year only."""
    ref_line = u"""[69] Phys. Rev. Lett. 52, 2009(1984)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">69</subfield>
<subfield code="s">Phys.Rev.Lett.,52,2009</subfield>
<subfield code="y">1984</subfield>
</datafield>
</record>""")
def test_extra_01(self):
    """Was parsed erroneously as Phys.Rev.Lett.,101,01 (page range '1-4' after
    the real page must not truncate the page number)."""
    ref_line = u"""[17] de Florian D, Sassot R, Stratmann M and Vogelsang W 2008 Phys. Rev. Lett. 101 072001 1-4; 2009 Phys.
Rev. D 80 034030 1-25"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">17</subfield>
<subfield code="h">de Florian D, Sassot R, Stratmann M and Vogelsang W</subfield>
<subfield code="s">Phys.Rev.Lett.,101,072001</subfield>
<subfield code="y">2008</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">17</subfield>
<subfield code="s">Phys.Rev.,D80,034030</subfield>
<subfield code="y">2009</subfield>
</datafield>
</record>""")
def test_extra_no_after_vol(self):
    """'no. 5' after the volume must not be mistaken for the numeration."""
    ref_line = u"""[130] A. Kuper, H. Letaw, L. Slifkin, E-Sonder, and C. T. Tomizuka, “Self- diffusion in copper,” Physical Review, vol. 96, no. 5, pp. 1224–1225, 1954."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">130</subfield>
<subfield code="h">A. Kuper, H. Letaw, L. Slifkin, E-Sonder, and C. T. Tomizuka</subfield>
<subfield code="t">Self- diffusion in copper</subfield>
<subfield code="s">Phys.Rev.,96,1224</subfield>
<subfield code="y">1954</subfield>
</datafield>
</record>""")
def test_jinst(self):
    """JINST citation with both a collaboration and an author list."""
    ref_line = u"""[1] ATLAS Collaboration, G. Aad et al., The ATLAS Experiment at the CERN Large Hadron Collider, JINST 3 (2008) S08003."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="c">ATLAS Collaboration</subfield>
<subfield code="h">G. Aad et al.</subfield>
<subfield code="s">JINST,3,S08003</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_collaboration2(self):
    """Collaboration name is extracted separately from the author list."""
    ref_line = u"""[28] Particle Data Group Collaboration, K. Nakamura et al., Review of particle physics, J. Phys. G37 (2010) 075021."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">28</subfield>
<subfield code="c">Particle Data Group Collaboration</subfield>
<subfield code="h">K. Nakamura et al.</subfield>
<subfield code="s">J.Phys.,G37,075021</subfield>
<subfield code="y">2010</subfield>
</datafield>
</record>""")
def test_sub_volume(self):
    """Issue marker 'no. 2' after the volume is dropped from the journal string."""
    ref_line = u"""[8] S. Horvat, D. Khartchenko, O. Kortner, S. Kotov, H. Kroha, A. Manz, S. Mohrdieck-Mock, K. Nikolaev, R. Richter, W. Stiller, C. Valderanis, J. Dubbert, F. Rauscher, and A. Staude, Operation of the ATLAS muon drift-tube chambers at high background rates and in magnetic fields, IEEE Trans. Nucl. Sci. 53 (2006) no. 2, 562–566"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">8</subfield>
<subfield code="h">S. Horvat, D. Khartchenko, O. Kortner, S. Kotov, H. Kroha, A. Manz, S. Mohrdieck-Mock, K. Nikolaev, R. Richter, W. Stiller, C. Valderanis, J. Dubbert, F. Rauscher, and A. Staude</subfield>
<subfield code="s">IEEE Trans.Nucl.Sci.,53,562</subfield>
<subfield code="y">2006</subfield>
</datafield>
</record>""")
def test_journal_not_recognized(self):
    """Long-form journal name maps to the short title Eur.Phys.J."""
    ref_line = u"""[33] A. Moraes, C. Buttar, and I. Dawson, Prediction for minimum bias and the underlying event at LHC energies, The European Physical Journal C - Particles and Fields 50 (2007) 435–466."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">33</subfield>
<subfield code="h">A. Moraes, C. Buttar, and I. Dawson</subfield>
<subfield code="s">Eur.Phys.J.,C50,435</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>""")
def test_multiple_eds(self):
    """Several '(ed.)' markers are kept inside the author list."""
    ref_line = u"""[7] L. Evans, (ed.) and P. Bryant, (ed.), LHC Machine, JINST 3 (2008) S08001."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">7</subfield>
<subfield code="h">L. Evans, (ed.) and P. Bryant, (ed.)</subfield>
<subfield code="s">JINST,3,S08001</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_atlas_conf(self):
    """Not recognizing preprint format: expects the collaboration, the URL
    (rejoined across the embedded space) and the ATLAS-CONF report number."""
    ref_line = u"""[32] The ATLAS Collaboration, Charged particle multiplicities in pp interactions at √s = 0.9 and 7 TeV in a diffractive limited phase space measured with the ATLAS detector at the LHC and a new pythia6 tune, 2010. http://cdsweb.cern.ch/record/1266235/files/ ATLAS-COM-CONF-2010-031.pdf. ATLAS-CONF-2010-031."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">32</subfield>
<subfield code="c">ATLAS Collaboration</subfield>
<subfield code="u">http://cdsweb.cern.ch/record/1266235/files/ATLAS-COM-CONF-2010-031.pdf</subfield>
<subfield code="r">ATLAS-CONF-2010-031</subfield>
</datafield>
</record>""")
def test_journal_of_physics(self):
    """Eventually not recognizing the journal, the collaboration or authors."""
    ref_line = u"""[19] ATLAS Inner Detector software group Collaboration, T. Cornelissen, M. Elsing, I. Gavilenko, W. Liebig, E. Moyse, and A. Salzburger, The new ATLAS Track Reconstruction (NEWT), Journal of Physics 119 (2008) 032014."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">19</subfield>
<subfield code="c">ATLAS Inner Detector software group Collaboration</subfield>
<subfield code="h">T. Cornelissen, M. Elsing, I. Gavilenko, W. Liebig, E. Moyse, and A. Salzburger</subfield>
<subfield code="s">J.Phys.,119,032014</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_jhep(self):
    """Was splitting JHEP in JHE: P; the JHEP volume folds in the year (0705)."""
    ref_line = u"""[22] G. P. Salam and G. Soyez, A practical seedless infrared-safe cone jet algorithm, JHEP 05 (2007) 086."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">22</subfield>
<subfield code="h">G. P. Salam and G. Soyez</subfield>
<subfield code="s">JHEP,0705,086</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>""")
def test_journal_not_recognized2(self):
    """'Vol 30 N° 11' style volume numbering after the journal name."""
    ref_line = u"""[3] Physics Performance Report Vol 1 – J. Phys. G. Vol 30 N° 11 (2004) 232"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">3</subfield>
<subfield code="s">J.Phys.,G30,232</subfield>
<subfield code="y">2004</subfield>
</datafield>
</record>""")
def test_journal_not_recognized3(self):
    """'N° 30' style volume numbering after the journal name."""
    ref_line = u"""[3] Physics Performance Report Vol 1 – J. Phys. G. N° 30 (2004) 232"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">3</subfield>
<subfield code="s">J.Phys.,G30,232</subfield>
<subfield code="y">2004</subfield>
</datafield>
</record>""")
def test_journal_not_recognized4(self):
    """Long-form journal name maps to Phys.Rev.ST Accel.Beams."""
    ref_line = u"""[128] D. P. Pritzkau and R. H. Siemann, “Experimental study of rf pulsed heat- ing on oxygen free electronic copper,” Physical Review Special Topics - Accelerators and Beams, vol. 5, pp. 1–22, 2002."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">128</subfield>
<subfield code="h">D. P. Pritzkau and R. H. Siemann</subfield>
<subfield code="t">Experimental study of rf pulsed heat- ing on oxygen free electronic copper</subfield>
<subfield code="s">Phys.Rev.ST Accel.Beams,5,1</subfield>
<subfield code="y">2002</subfield>
</datafield>
</record>""")
def test_journal_not_recognized5(self):
    """Trailing series letter on the volume ('100B') is moved in front ('B100')."""
    ref_line = u"""[128] D. P. Pritzkau and R. H. Siemann, Phys.Lett. 100B (1981), 117"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">128</subfield>
<subfield code="h">D. P. Pritzkau and R. H. Siemann</subfield>
<subfield code="s">Phys.Lett.,B100,117</subfield>
<subfield code="y">1981</subfield>
</datafield>
</record>""")
def test_note_format1(self):
    """'CLIC Note 615, CERN' is normalised to the report number CERN-CLIC-Note-615."""
    ref_line = u"""[91] S. Calatroni, H. Neupert, and M. Taborelli, “Fatigue testing of materials by UV pulsed laser irradiation,” CLIC Note 615, CERN, 2004."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">91</subfield>
<subfield code="h">S. Calatroni, H. Neupert, and M. Taborelli</subfield>
<subfield code="t">Fatigue testing of materials by UV pulsed laser irradiation</subfield>
<subfield code="r">CERN-CLIC-Note-615</subfield>
</datafield>
</record>""")
def test_note_format2(self):
    """'CERN CLIC-Note-764' is normalised to the report number CERN-CLIC-Note-764."""
    ref_line = u"""[5] H. Braun, R. Corsini, J. P. Delahaye, A. de Roeck, S. Dbert, A. Ferrari, G. Geschonke, A. Grudiev, C. Hauviller, B. Jeanneret, E. Jensen, T. Lefvre, Y. Papaphilippou, G. Riddone, L. Rinolfi, W. D. Schlatter, H. Schmickler, D. Schulte, I. Syratchev, M. Taborelli, F. Tecker, R. Toms, S. Weisz, and W. Wuensch, “CLIC 2008 parameters,” tech. rep., CERN CLIC-Note-764, Oct 2008."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">5</subfield>
<subfield code="h">H. Braun, R. Corsini, J. P. Delahaye, A. de Roeck, S. Dbert, A. Ferrari, G. Geschonke, A. Grudiev, C. Hauviller, B. Jeanneret, E. Jensen, T. Lefvre, Y. Papaphilippou, G. Riddone, L. Rinolfi, W. D. Schlatter, H. Schmickler, D. Schulte, I. Syratchev, M. Taborelli, F. Tecker, R. Toms, S. Weisz, and W. Wuensch</subfield>
<subfield code="t">CLIC 2008 parameters</subfield>
<subfield code="r">CERN-CLIC-Note-764</subfield>
</datafield>
</record>""")
def test_remove_empty_misc_tag(self):
    """Bare URL reference yields only the 'u' subfield and no empty misc ('m')
    subfield, even with ignore_misc=False."""
    ref_line = u"""[21] “http://www.linearcollider.org/.”"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">21</subfield>
<subfield code="u">http://www.linearcollider.org/</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_sub_volume_not_recognized(self):
    """Issue marker 'no. 5669' after the volume is ignored for Science."""
    ref_line = u"""[37] L. Lu, Y. Shen, X. Chen, L. Qian, and K. Lu, “Ultrahigh strength and high electrical conductivity in copper,” Science, vol. 304, no. 5669, pp. 422–426, 2004."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="h">L. Lu, Y. Shen, X. Chen, L. Qian, and K. Lu</subfield>
<subfield code="t">Ultrahigh strength and high electrical conductivity in copper</subfield>
<subfield code="s">Science,304,422</subfield>
<subfield code="y">2004</subfield>
</datafield>
</record>""")
def test_extra_a_after_journal(self):
    """NOTE(review): same input and expectation as test_collaboration2 above —
    looks like an accidental duplicate; consider removing one of the two."""
    ref_line = u"""[28] Particle Data Group Collaboration, K. Nakamura et al., Review of particle physics, J. Phys. G37 (2010) 075021."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">28</subfield>
<subfield code="c">Particle Data Group Collaboration</subfield>
<subfield code="h">K. Nakamura et al.</subfield>
<subfield code="s">J.Phys.,G37,075021</subfield>
<subfield code="y">2010</subfield>
</datafield>
</record>""")
def test_full_month_with_volume(self):
    """'(July, 1985)' month-and-year date: only the year is kept."""
    ref_line = u"""[2] C. Rubbia, Experimental observation of the intermediate vector bosons W+, W−, and Z0, Reviews of Modern Physics 57 (July, 1985) 699–722."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">2</subfield>
<subfield code="h">C. Rubbia</subfield>
<subfield code="s">Rev.Mod.Phys.,57,699</subfield>
<subfield code="y">1985</subfield>
</datafield>
</record>""")
def test_wrong_replacement(self):
    """Wrong replacement:
    'A. J. Hey, Gauge' must not be rewritten as 'Astron.J. Hey'.
    """
    ref_line = u"""[5] I. J. Aitchison and A. J. Hey, Gauge Theories in Particle Physics, Vol II: QCD and the Electroweak Theory. CRC Pr I Llc, 2003."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">5</subfield>
<subfield code="h">I. J. Aitchison and A. J. Hey</subfield>
<subfield code="p">CRC Pr.</subfield>
</datafield>
</record>""")
def test_author_replacement(self):
    """Name containing a combining diaeresis ('Du ̈hrssen') is normalised to
    the precomposed form (\\xfc, i.e. 'ü') in the author subfield."""
    ref_line = u"""[48] D. Adams, S. Asai, D. Cavalli, M. Du ̈hrssen, K. Edmonds, S. Elles, M. Fehling, U. Felzmann, L. Gladilin, L. Helary, M. Hohlfeld, S. Horvat, K. Jakobs, M. Kaneda, G. Kirsch, S. Kuehn, J. F. Marchand, C. Pizio, X. Portell, D. Rebuzzi, E. Schmidt, A. Shibata, I. Vivarelli, S. Winkelmann, and S. Yamamoto, The ATLFAST-II performance in release 14 -particle signatures and selected benchmark processes-, Tech. Rep. ATL-PHYS-INT-2009-110, CERN, Geneva, Dec, 2009."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">48</subfield>
<subfield code="h">D. Adams, S. Asai, D. Cavalli, M. D\xfchrssen, K. Edmonds, S. Elles, M. Fehling, U. Felzmann, L. Gladilin, L. Helary, M. Hohlfeld, S. Horvat, K. Jakobs, M. Kaneda, G. Kirsch, S. Kuehn, J. F. Marchand, C. Pizio, X. Portell, D. Rebuzzi, E. Schmidt, A. Shibata, I. Vivarelli, S. Winkelmann, and S. Yamamoto</subfield>
<subfield code="r">ATL-PHYS-INT-2009-110</subfield>
</datafield>
</record>""")
def test_author_not_recognized1(self):
    """'et al, etc.' tail of the author list collapses to a single 'et al.'."""
    ref_line = u"""[7] Pod I., C. Jennings, et al, etc., Nucl. Phys. B342, 15 (1990)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">7</subfield>
<subfield code="h">Pod I., C. Jennings, et al.</subfield>
<subfield code="s">Nucl.Phys.,B342,15</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
def test_title_comma(self):
    """Comma between journal title and volume ('Nucl. Instrum. Methods, A570')."""
    ref_line = u"""[24] R. Downing et al., Nucl. Instrum. Methods, A570, 36 (2007)."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">24</subfield>
<subfield code="h">R. Downing et al.</subfield>
<subfield code="s">Nucl.Instrum.Meth.,A570,36</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>""")
def test_author1(self):
    """Semicolon-separated citations each keep their own author list; the last
    one has no authors at all."""
    ref_line = u"""[43] L.S. Durkin and P. Langacker, Phys. Lett B166, 436 (1986); Amaldi et al., Phys. Rev. D36, 1385 (1987); Hayward and Yellow et al., Phys. Lett B245, 669 (1990); Nucl. Phys. B342, 15 (1990);"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">43</subfield>
<subfield code="h">L.S. Durkin and P. Langacker</subfield>
<subfield code="s">Phys.Lett.,B166,436</subfield>
<subfield code="y">1986</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">43</subfield>
<subfield code="h">Amaldi et al.</subfield>
<subfield code="s">Phys.Rev.,D36,1385</subfield>
<subfield code="y">1987</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">43</subfield>
<subfield code="h">Hayward and Yellow et al.</subfield>
<subfield code="s">Phys.Lett.,B245,669</subfield>
<subfield code="y">1990</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">43</subfield>
<subfield code="s">Nucl.Phys.,B342,15</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
def test_author2(self):
    """Mixed semicolon list: journal citations, an edited proceedings volume
    (editor rendered as '(E. Berger (eds.))') and its publisher."""
    ref_line = u"""[15] Nucl. Phys., B372, 3 (1992); T.G. Rizzo, Phys. Rev. D40, 3035 (1989); Proceedings of the 1990 Summer Study on High Energy Physics. ed E. Berger, June 25-July 13, 1990, Snowmass Colorado (World Scientific, Singapore, 1992) p. 233; V. Barger, J.L. Hewett and T.G. Rizzo, Phys. Rev. D42, 152 (1990); J.L. Hewett, Phys. Lett. B238, 98 (1990)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="s">Nucl.Phys.,B372,3</subfield>
<subfield code="y">1992</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">T.G. Rizzo</subfield>
<subfield code="s">Phys.Rev.,D40,3035</subfield>
<subfield code="y">1989</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">(E. Berger (eds.))</subfield>
<subfield code="p">World Scientific</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">V. Barger, J.L. Hewett and T.G. Rizzo</subfield>
<subfield code="s">Phys.Rev.,D42,152</subfield>
<subfield code="y">1990</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">J.L. Hewett</subfield>
<subfield code="s">Phys.Lett.,B238,98</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
def test_merging(self):
    """Test how references are merged together.
    We may choose to merge invalid references to the previous one;
    here each semicolon-separated fragment stays a separate datafield
    (checked with ignore_misc=False so misc fragments are visible)."""
    ref_line = u"""[15] Nucl. Phys., B372, 3 (1992); T.G. Rizzo, Phys. Rev. D40, 3035 (1989); Proceedings of the 1990 Summer Study on High Energy Physics; ed E. Berger; V. Barger, J.L. Hewett and T.G. Rizzo ; Phys. Rev. D42, 152 (1990); J.L. Hewett, Phys. Lett. B238, 98 (1990)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="s">Nucl.Phys.,B372,3</subfield>
<subfield code="y">1992</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">T.G. Rizzo</subfield>
<subfield code="s">Phys.Rev.,D40,3035</subfield>
<subfield code="y">1989</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="m">Proceedings of the 1990 Summer Study on High Energy Physics</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">(E. Berger (eds.))</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">V. Barger, J.L. Hewett and T.G. Rizzo</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="s">Phys.Rev.,D42,152</subfield>
<subfield code="y">1990</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">J.L. Hewett</subfield>
<subfield code="s">Phys.Lett.,B238,98</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_merging2(self):
    """Unparsable trailing fragment becomes its own misc-only datafield."""
    ref_line = u"""[15] Nucl. Phys., B372, 3 (1992); hello world"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="s">Nucl.Phys.,B372,3</subfield>
<subfield code="y">1992</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="m">hello world</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_merging3(self):
    """Author found inside a misc fragment is split out into its own 'h'
    subfield; the surrounding text stays as misc."""
    ref_line = u"""[15] Nucl. Phys., B372, 3 (1992); hello world T.G. Rizzo foo"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="s">Nucl.Phys.,B372,3</subfield>
<subfield code="y">1992</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="m">hello world foo</subfield>
<subfield code="h">T.G. Rizzo</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_merging4(self):
    """Author-only fragment before the journal fragment stays separate."""
    ref_line = u"""[15] T.G. Rizzo; Nucl. Phys., B372, 3 (1992)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">T.G. Rizzo</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="s">Nucl.Phys.,B372,3</subfield>
<subfield code="y">1992</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_merging5(self):
    """Multi-line reference: arXiv ids, a collaboration in square brackets and
    journal fragments split across line breaks (note the expected artefacts
    'Pavanand' and '(2010)012069' from the line joins)."""
    ref_line = u"""[39] C. Arnaboldi et al., Nucl. Instrum. Meth. A 518 (2004) 775
[hep-ex/0212053]; M. Sisti [CUORE Collaboration], J. Phys. Conf. Ser. 203 (2010)
012069; F. Bellini, C. Bucci, S. Capelli, O. Cremonesi, L. Gironi, M. Martinez, M. Pavan
and C. Tomei et al., Astropart. Phys. 33 (2010) 169 [arXiv:0912.0452 [physics.ins-det]]."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">C. Arnaboldi et al.</subfield>
<subfield code="s">Nucl.Instrum.Meth.,A518,775</subfield>
<subfield code="r">hep-ex/0212053</subfield>
<subfield code="y">2004</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">M. Sisti</subfield>
<subfield code="c">CUORE Collaboration</subfield>
<subfield code="m">J. Phys. Conf. Ser. 203 (2010)012069</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">F. Bellini, C. Bucci, S. Capelli, O. Cremonesi, L. Gironi, M. Martinez, M. Pavanand C. Tomei et al.</subfield>
<subfield code="m">Astropart. Phys. 33 (2010) 169</subfield>
<subfield code="r">arXiv:0912.0452 [physics.ins-det]</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_extra_blank_reference(self):
    """Trailing semicolon must not produce an extra empty reference."""
    ref_line = u"""[26] U. Gursoy and E. Kiritsis, “Exploring improved holographic theories for QCD: Part I,” JHEP 0802 (2008) 032 [ArXiv:0707.1324][hep-th]; U. Gursoy, E. Kiritsis and F. Nitti, “Exploring improved holographic theories for QCD: Part II,” JHEP 0802 (2008) 019 [ArXiv:0707.1349][hep-th];"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">26</subfield>
<subfield code="h">U. Gursoy and E. Kiritsis</subfield>
<subfield code="t">Exploring improved holographic theories for QCD: Part I</subfield>
<subfield code="s">JHEP,0802,032</subfield>
<subfield code="r">arXiv:0707.1324</subfield>
<subfield code="m">[hep-th]</subfield>
<subfield code="y">2008</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">26</subfield>
<subfield code="h">U. Gursoy, E. Kiritsis and F. Nitti</subfield>
<subfield code="t">Exploring improved holographic theories for QCD: Part II</subfield>
<subfield code="s">JHEP,0802,019</subfield>
<subfield code="r">arXiv:0707.1349</subfield>
<subfield code="m">[hep-th]</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_invalid_author(self):
    """Used to detect an invalid author 'at Finite T' inside the title."""
    ref_line = u"""[23] A. Taliotis, “qq ̄ Potential at Finite T and Weak Coupling in N = 4,” Phys. Rev. C83, 045204 (2011). [ArXiv:1011.6618][hep-th]."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">23</subfield>
<subfield code="h">A. Taliotis</subfield>
<subfield code="t">qq \u0304 Potential at Finite T and Weak Coupling in N = 4</subfield>
<subfield code="s">Phys.Rev.,C83,045204</subfield>
<subfield code="r">arXiv:1011.6618</subfield>
<subfield code="y">2011</subfield>
</datafield>
</record>""")
def test_split_arxiv(self):
    """Used to split the arXiv id away from its reference."""
    ref_line = u"""[18] A. Taliotis, “DIS from the AdS/CFT correspondence,” Nucl. Phys. A830, 299C-302C (2009). [ArXiv:0907.4204][hep-th]."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">18</subfield>
<subfield code="h">A. Taliotis</subfield>
<subfield code="t">DIS from the AdS/CFT correspondence</subfield>
<subfield code="s">Nucl.Phys.,A830,299C</subfield>
<subfield code="r">arXiv:0907.4204</subfield>
<subfield code="y">2009</subfield>
</datafield>
</record>""")
def test_report_without_dash(self):
    """'CERN-LHCC2005-022' gains the missing dash in the report number."""
    ref_line = u"""[20] G. Duckeck et al., “ATLAS computing: Technical design report,” CERN-LHCC2005-022."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">20</subfield>
<subfield code="h">G. Duckeck et al.</subfield>
<subfield code="t">ATLAS computing: Technical design report</subfield>
<subfield code="r">CERN-LHCC-2005-022</subfield>
</datafield>
</record>""")
def test_report_with_slashes(self):
    """'CERN/LHCC/2005-022' slashes are normalised to dashes."""
    ref_line = u"""[20] G. Duckeck et al., “ATLAS computing: Technical design report,” CERN/LHCC/2005-022."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">20</subfield>
<subfield code="h">G. Duckeck et al.</subfield>
<subfield code="t">ATLAS computing: Technical design report</subfield>
<subfield code="r">CERN-LHCC-2005-022</subfield>
</datafield>
</record>""")
def test_ed_before_et_al(self):
    """'(ed. )' before 'et al.' is normalised to '(ed.)'."""
    ref_line = u"""[20] G. Duckeck, (ed. ) et al., “ATLAS computing: Technical design report,” CERN-LHCC-2005-022."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">20</subfield>
<subfield code="h">G. Duckeck, (ed.) et al.</subfield>
<subfield code="t">ATLAS computing: Technical design report</subfield>
<subfield code="r">CERN-LHCC-2005-022</subfield>
</datafield>
</record>""")
def test_journal_but_no_page(self):
    """Journal-like string without a page number yields no 's' subfield."""
    ref_line = u"""[20] G. Duckeck, “ATLAS computing: Technical design report,” JHEP,03,1988"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">20</subfield>
<subfield code="h">G. Duckeck</subfield>
<subfield code="t">ATLAS computing: Technical design report</subfield>
</datafield>
</record>""")
def test_isbn1(self):
    """Hyphenated ISBN-10 is extracted into the 'i' subfield."""
    ref_line = u"""[22] B. Crowell, Vibrations and Waves. www.lightandmatter.com, 2009. ISBN 0-9704670-3-6."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">22</subfield>
<subfield code="h">B. Crowell</subfield>
<subfield code="i">0-9704670-3-6</subfield>
</datafield>
</record>""")
def test_isbn2(self):
    """Plain ISBN-13 plus a publisher ('Mcgraw-Hill' normalised to 'McGraw-Hill')."""
    ref_line = u"""[119] D. E. Gray, American Institute of Physics Handbook. Mcgraw-Hill, 3rd ed., 1972. ISBN 9780070014855."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">119</subfield>
<subfield code="h">D. E. Gray</subfield>
<subfield code="p">McGraw-Hill</subfield>
<subfield code="i">9780070014855</subfield>
</datafield>
</record>""")
def test_book(self):
    """Book citation: publisher, quoted title and year are extracted."""
    ref_line = u"""[1] D. Griffiths, “Introduction to elementary particles,” Weinheim, USA: Wiley-VCH (2008) 454 p."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">D. Griffiths</subfield>
<subfield code="p">Wiley-VCH</subfield>
<subfield code="t">Introduction to elementary particles</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_complex_arxiv(self):
    """Versioned arXiv id: the 'v1' suffix is stripped, the category kept."""
    ref_line = u"""[4] J.Prat, arXiv:1012.3675v1 [physics.ins-det]"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="h">J.Prat</subfield>
<subfield code="r">arXiv:1012.3675 [physics.ins-det]</subfield>
</datafield>
</record>""")
def test_new_arxiv(self):
    """Bare new-style arXiv id '[0802.2879]' gains the 'arXiv:' prefix."""
    ref_line = u"""[178] D. R. Tovey, On measuring the masses of pair-produced semi-invisibly decaying particles at hadron colliders, JHEP 04 (2008) 034, [0802.2879]."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">178</subfield>
<subfield code="h">D. R. Tovey</subfield>
<subfield code="s">JHEP,0804,034</subfield>
<subfield code="r">arXiv:0802.2879</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_new_arxiv2(self):
    """New-style arXiv id with yymm '9112' is still accepted as valid."""
    ref_line = u"""[178] D. R. Tovey, On measuring the masses of pair-produced semi-invisibly decaying particles at hadron colliders, JHEP 04 (2008) 034, [9112.2879]."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">178</subfield>
<subfield code="h">D. R. Tovey</subfield>
<subfield code="s">JHEP,0804,034</subfield>
<subfield code="r">arXiv:9112.2879</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_new_arxiv3(self):
    """New-style arXiv id with yymm '1212' is accepted as valid."""
    ref_line = u"""[178] D. R. Tovey, On measuring the masses of pair-produced semi-invisibly decaying particles at hadron colliders, JHEP 04 (2008) 034, [1212.2879]."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">178</subfield>
<subfield code="h">D. R. Tovey</subfield>
<subfield code="s">JHEP,0804,034</subfield>
<subfield code="r">arXiv:1212.2879</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_new_arxiv_invalid(self):
    """'[9002.2879]' is treated as an invalid new-style arXiv id and dropped
    (no 'r' subfield expected)."""
    ref_line = u"""[178] D. R. Tovey, On measuring the masses of pair-produced semi-invisibly decaying particles at hadron colliders, JHEP 04 (2008) 034, [9002.2879]."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">178</subfield>
<subfield code="h">D. R. Tovey</subfield>
<subfield code="s">JHEP,0804,034</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_new_arxiv_invalid2(self):
    """[9113.2879] is NOT tagged as an arXiv id: no <subfield code="r"> expected.

    NOTE(review): 13 is not a valid month, which looks like the rejection
    reason — confirm against re_new_arxiv.
    """
    ref_line = u"""[178] D. R. Tovey, On measuring the masses of pair-produced semi-invisibly decaying particles at hadron colliders, JHEP 04 (2008) 034, [9113.2879]."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">178</subfield>
<subfield code="h">D. R. Tovey</subfield>
<subfield code="s">JHEP,0804,034</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_special_journals(self):
    """'JHEP 04 (2008) 034' yields the year-prefixed volume JHEP,0804,034."""
    citation = u"""[178] D. R. Tovey, JHEP 04 (2008) 034"""
    expected_xml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">178</subfield>
<subfield code="h">D. R. Tovey</subfield>
<subfield code="s">JHEP,0804,034</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_xml)
def test_unrecognized_author(self):
ref_line = u"""[27] B. Feng, Y. -H. He, P. Fre', "On correspondences between toric singularities and (p,q) webs," Nucl. Phys. B701 (2004) 334-356. [hep-th/0403133]"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">27</subfield>
<subfield code="h">B. Feng, Y. -H. He, P. Fre'</subfield>
<subfield code="t">On correspondences between toric singularities and (p,q) webs</subfield>
<subfield code="s">Nucl.Phys.,B701,334</subfield>
<subfield code="r">hep-th/0403133</subfield>
<subfield code="y">2004</subfield>
</datafield>
</record>""")
def test_unrecognized_author2(self):
ref_line = u"""[75] J. M. Figueroa-O’Farrill, J. M. Figueroa-O'Farrill, C. M. Hull and B. J. Spence, "Branes at conical singularities and holography," Adv. Theor. Math. Phys. 2, 1249 (1999) [arXiv:hep-th/9808014]"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">75</subfield>
<subfield code="h">J. M. Figueroa-O’Farrill, J. M. Figueroa-O'Farrill, C. M. Hull and B. J. Spence</subfield>
<subfield code="t">Branes at conical singularities and holography</subfield>
<subfield code="s">Adv.Theor.Math.Phys.,2,1249</subfield>
<subfield code="r">hep-th/9808014</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>""")
def test_pos(self):
ref_line = u"""[23] M. A. Donnellan, et al., PoS LAT2007 (2007) 369."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">23</subfield>
<subfield code="h">M. A. Donnellan, et al.</subfield>
<subfield code="s">PoS,LAT2007,369</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>""")
def test_pos2(self):
ref_line = u"""[23] M. A. Donnellan, et al., PoS LAT2007 2007 369."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">23</subfield>
<subfield code="h">M. A. Donnellan, et al.</subfield>
<subfield code="s">PoS,LAT2007,369</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>""")
def test_pos3(self):
ref_line = u"""[23] M. A. Donnellan, et al., PoS(LAT2005)239."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">23</subfield>
<subfield code="h">M. A. Donnellan, et al.</subfield>
<subfield code="s">PoS,LAT2005,239</subfield>
<subfield code="y">2005</subfield>
</datafield>
</record>""")
def test_pos4(self):
    """'PoS CHARGED 2010, 030 (2010)': volume becomes CHARGED2010 (space dropped), page 030."""
    ref_line = u"""[23] PoS CHARGED 2010, 030 (2010)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">23</subfield>
<subfield code="s">PoS,CHARGED2010,030</subfield>
<subfield code="y">2010</subfield>
</datafield>
</record>""")
def test_complex_author(self):
ref_line = u"""[39] Michael E. Peskin, Michael E. Peskin and Michael E. Peskin “An Introduction To Quantum Field Theory,” Westview Press, 1995."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">Michael E. Peskin, Michael E. Peskin and Michael E. Peskin</subfield>
<subfield code="t">An Introduction To Quantum Field Theory</subfield>
</datafield>
</record>""")
def test_complex_author2(self):
ref_line = u"""[39] Dan V. Schroeder, Dan V. Schroeder and Dan V. Schroeder “An Introduction To Quantum Field Theory,” Westview Press, 1995."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">Dan V. Schroeder, Dan V. Schroeder and Dan V. Schroeder</subfield>
<subfield code="t">An Introduction To Quantum Field Theory</subfield>
</datafield>
</record>""")
def test_dan_journal(self):
ref_line = u"""[39] Michael E. Peskin and Dan V. Schroeder “An Introduction To Quantum Field Theory,” Westview Press, 1995."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">Michael E. Peskin and Dan V. Schroeder</subfield>
<subfield code="t">An Introduction To Quantum Field Theory</subfield>
</datafield>
</record>""")
def test_dan_journal2(self):
ref_line = u"""[39] Dan V. Schroeder DAN B701 (2004) 334-356"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">Dan V. Schroeder</subfield>
<subfield code="s">Dokl.Akad.Nauk Ser.Fiz.,B701,334</subfield>
<subfield code="y">2004</subfield>
</datafield>
</record>""")
def test_query_in_url(self):
    """A URL query string is kept whole and '&' is XML-escaped to '&amp;' in subfield u."""
    ref_line = u"""[69] ATLAS Collaboration. Mutag. http://indico.cern.ch/getFile.py/access?contribId=9&resId=1&materialId=slides&confId=35502"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">69</subfield>
<subfield code="c">ATLAS Collaboration</subfield>
<subfield code="u">http://indico.cern.ch/getFile.py/access?contribId=9&amp;resId=1&amp;materialId=slides&amp;confId=35502</subfield>
</datafield>
</record>""")
def test_volume_colon_page(self):
ref_line = u"""[77] J. M. Butterworth et al. Multiparton interactions in photoproduction at hera. Z.Phys.C72:637-646,1996."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">77</subfield>
<subfield code="h">J. M. Butterworth et al.</subfield>
<subfield code="s">Z.Phys.,C72,637</subfield>
<subfield code="y">1996</subfield>
</datafield>
</record>""")
def test_no_spaces_numeration(self):
ref_line = u"""[1] I.M. Gregor et al, Optical links for the ATLAS SCT and Pixel detector, Z.Phys. 465(2001)131-134"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">I.M. Gregor et al.</subfield>
<subfield code="s">Z.Phys.,465,131</subfield>
<subfield code="y">2001</subfield>
</datafield>
</record>""")
def test_dot_after_year(self):
ref_line = u"""[1] Neutrino Mass and New Physics, Phys.Rev. 2006. 56:569-628"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="s">Phys.Rev.,56,569</subfield>
<subfield code="y">2006</subfield>
</datafield>
</record>""")
def test_journal_roman(self):
ref_line = u"""[19] D. Page and C. Pope, Commun. Math. Phys. VI (1990) 529."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">19</subfield>
<subfield code="h">D. Page and C. Pope</subfield>
<subfield code="s">Commun.Math.Phys.,6,529</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
def test_journal_phys_rev_d(self):
ref_line = u"""[6] Sivers D. W., Phys. Rev.D, 41 (1990) 83"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="s">Phys.Rev.,D41,83</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
def test_publisher(self):
ref_line = u"""[6] Sivers D. W., BrAnS Hello"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="p">Brans</subfield>
</datafield>
</record>""")
def test_hep_formatting(self):
ref_line = u"""[6] Sivers D. W., hep-ph-9711200"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="r">hep-ph/9711200</subfield>
</datafield>
</record>""")
def test_hep_formatting2(self):
ref_line = u"""[6] Sivers D. W., astro-ph-9711200"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="r">astro-ph/9711200</subfield>
</datafield>
</record>""")
def test_nucl_phys_b_removal(self):
ref_line = u"""[6] Sivers D. W., Nucl. Phys. (Proc.Suppl.) B21 (2004) 334-356"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="s">Nucl.Phys.Proc.Suppl.,21,334</subfield>
<subfield code="y">2004</subfield>
</datafield>
</record>""")
def test_citations_splitting(self):
ref_line = u"""[6] Sivers D. W., CERN-EX-0106015, D. Page, CERN-EX-0104007"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="r">CERN-EX-0106015</subfield>
<subfield code="0">1</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">D. Page</subfield>
<subfield code="r">CERN-EX-0104007</subfield>
<subfield code="0">2</subfield>
</datafield>
</record>""")
def test_citations_splitting2(self):
ref_line = u"""[6] Sivers D. W., hep-ex/0201013, D. Page, CERN-EP-2001-094"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="r">hep-ex/0201013</subfield>
<subfield code="r">CERN-EP-2001-094</subfield>
<subfield code="0">10</subfield>
</datafield>
</record>""")
def test_arxiv_report_number(self):
"""Should be recognized by arxiv regexps list
(not in report-numbers.kb)
"""
ref_line = u"""[6] Sivers D. W., math.AA/0101888"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="r">math.AA/0101888</subfield>
</datafield>
</record>""")
def test_arxiv_report_number2(self):
""": instead of / in arxiv report number"""
ref_line = u"""[12] C. T. Hill and E. H. Simmons, Phys. Rept. 381: 235-402 (2003), Erratum-ibid. 390: 553-554 (2004) [arXiv: hep-ph:0203079]."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="h">C. T. Hill and E. H. Simmons</subfield>
<subfield code="r">hep-ph/0203079</subfield>
</datafield>
</record>""")
def test_arxiv_report_number3(self):
""": instead of / in arxiv report number"""
ref_line = u"""[12] hep-ph/0203079v1"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="r">hep-ph/0203079</subfield>
</datafield>
</record>""")
def test_arxiv_report_number4(self):
""": instead of / in arxiv report number"""
ref_line = u"""[12] hep-ph/0203079invalid"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="m">hep-ph/0203079invalid</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_arxiv_not_parsed(self):
ref_line = u"""[12] arXiv: 0701034 [hep-ph]"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="r">hep-ph/0701034</subfield>
</datafield>
</record>""")
def test_arxiv_report_number_replacement(self):
"""Should be replaced by a valid arxiv report number"""
ref_line = u"""[6] Sivers D. W., astro-phy/0101888"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="r">astro-ph/0101888</subfield>
</datafield>
</record>""")
def test_only_report_number(self):
ref_line = u"""[6] ATL-PHYS-INT-2009-110"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="r">ATL-PHYS-INT-2009-110</subfield>
</datafield>
</record>""")
def test_only_journal(self):
ref_line = u"""[6] Phys. Rev.D, 41 (1990) 83"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="s">Phys.Rev.,D41,83</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
def test_only_doi(self):
ref_line = u"""[6] doi:10.1007/s10440-008-9280-9"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="a">10.1007/s10440-008-9280-9</subfield>
</datafield>
</record>""")
def test_reference_size_limit_check_valid_in_one_line(self):
    """References within the size limit keep their report numbers.

    Counterpart of test_reference_size_limit_but_removed_as_invalid: the
    same two references, short enough that both ATL-PHYS-INT numbers survive.
    """
    from invenio.refextract_api import extract_references_from_string
    ref_line = u"""[1] D. Adams, S. Asai, D. Cavalli, K. Edmonds,
The ATLFAST-II performance in release 14,
Tech. Rep. ATL-PHYS-INT-2009-110, CERN, Geneva, Dec, 2009.
[2] D. Adams, ATL-PHYS-INT-2009-111"""
    record = extract_references_from_string(ref_line)
    compare_references(self, record, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">D. Adams, S. Asai, D. Cavalli, K. Edmonds</subfield>
<subfield code="r">ATL-PHYS-INT-2009-110</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">2</subfield>
<subfield code="h">D. Adams</subfield>
<subfield code="r">ATL-PHYS-INT-2009-111</subfield>
</datafield>
</record>""")
def test_reference_size_limit_but_removed_as_invalid(self):
    """Test the removal of references that are more than n lines long
    Needs to match test_reference_size_limit_check_valid_in_one_line
    above but be on multiple lines
    """
    from invenio.refextract_api import extract_references_from_string
    # The runs of "a\n" pad reference [1] past the line limit; note the
    # expected output below has no <subfield code="r"> for [1], while the
    # short reference [2] keeps its report number.
    ref_line = u"""[1] D. Adams, S. Asai, D. Cavalli, K. Edmonds,
a\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\n
a\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\n
The ATLFAST-II performance in release 14,
Tech. Rep. ATL-PHYS-INT-2009-110, CERN, Geneva, Dec, 2009.
[2] D. Adams, ATL-PHYS-INT-2009-111"""
    record = extract_references_from_string(ref_line)
    compare_references(self, record, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">D. Adams, S. Asai, D. Cavalli, K. Edmonds</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">2</subfield>
<subfield code="h">D. Adams</subfield>
<subfield code="r">ATL-PHYS-INT-2009-111</subfield>
</datafield>
</record>""")
def test_author_tag_inside_quoted(self):
"""Tests embeded tags in quoted text
We want to avoid this
<cds.QUOTED>Electroweak parameters of the Z0 resonance and the Standard
Model <cds.AUTHincl>the LEP Collaborations</cds.AUTHincl></cds.QUOTED>
"""
ref_line = u"""[10] LEP Collaboration, G. Alexander et al., “Electroweak parameters of the Z0 resonance and the Standard Model: the LEP Collaborations,” Phys. Lett. B276 (1992) 247–253."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">10</subfield>
<subfield code="c">LEP Collaboration</subfield>
<subfield code="h">G. Alexander et al.</subfield>
<subfield code="t">Electroweak parameters of the Z0 resonance and the Standard Model: the LEP Collaborations</subfield>
<subfield code="s">Phys.Lett.,B276,247</subfield>
<subfield code="y">1992</subfield>
</datafield>
</record>""")
def test_misparsing_arxiv(self):
ref_line = u"""[21] R. Barlow, Asymmetric errors, eConf C030908 (2003), arXiv:physics/0401042."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">21</subfield>
<subfield code="h">R. Barlow</subfield>
<subfield code="r">physics/0401042</subfield>
</datafield>
</record>""")
def test_no_volume(self):
ref_line = u"""[6] Owen F.N., Rudnick L., 1976, Phys. Rev., 205L, 1"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Owen F.N., Rudnick L.</subfield>
<subfield code="s">Phys.Rev.,L205,1</subfield>
<subfield code="y">1976</subfield>
</datafield>
</record>""")
def test_numeration_detached(self):
"""Numeration detection check
At some point was reporting two journals, detecting twice the same
numeration
"""
ref_line = u"""[6] B. Friman, in The CBM Phys. Rev. book: Compressed baryonic matter in laboratory, Phys. Rev. 814, 1 (2011)."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">B. Friman</subfield>
<subfield code="s">Phys.Rev.,814,1</subfield>
<subfield code="y">2011</subfield>
</datafield>
</record>""")
def test_no_volume2(self):
"""At some point failed to report volume correctly"""
ref_line = u"""[3] S. Sarkar, Nucl. Phys. A 862-863, 13 (2011)."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">3</subfield>
<subfield code="h">S. Sarkar</subfield>
<subfield code="s">Nucl.Phys.,A862,13</subfield>
<subfield code="y">2011</subfield>
</datafield>
</record>""")
def test_journal_title_mangled(self):
    """Makes sure this journal does NOT get confused with an author.

    (The original docstring said "gets confused", but the expected output
    below shows the journal parsed correctly into subfield s — wording fixed.)
    """
    ref_line = u"""[12] K. G. Chetyrkin and A. Khodjamirian, Eur. Phys. J. C46 (2006)
721"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="h">K. G. Chetyrkin and A. Khodjamirian</subfield>
<subfield code="s">Eur.Phys.J.,C46,721</subfield>
<subfield code="y">2006</subfield>
</datafield>
</record>""")
def test_volume_letter_goes_missing(self):
ref_line = u"""[6] N. Cabibbo and G. Parisi, Phys. Lett. 59 B (1975) 67."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">N. Cabibbo and G. Parisi</subfield>
<subfield code="s">Phys.Lett.,B59,67</subfield>
<subfield code="y">1975</subfield>
</datafield>
</record>""")
def test_removed_dot_in_authors(self):
ref_line = u"""[6] Cabibbo N. and Parisi G.: Phys. Lett. 59 B (1975) 67."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Cabibbo N. and Parisi G.</subfield>
<subfield code="s">Phys.Lett.,B59,67</subfield>
<subfield code="y">1975</subfield>
</datafield>
</record>""")
def test_author_with_accents(self):
ref_line = u"""[1] Ôrlo A., Eur. Phys. J. C46 (2006) 721"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">Ôrlo A.</subfield>
<subfield code="s">Eur.Phys.J.,C46,721</subfield>
<subfield code="y">2006</subfield>
</datafield>
</record>""")
def test_implied_ibid(self):
    """An implied ibid ('; 574, 239 (2003)') inherits the journal and the
    volume letter (B) from the preceding citation: Phys.Lett.,B574,239.
    """
    ref_line = u"""[4] S. F. King and G. G. Ross, Phys. Lett. B 520, 243 (2001); 574, 239 (2003)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="h">S. F. King and G. G. Ross</subfield>
<subfield code="s">Phys.Lett.,B520,243</subfield>
<subfield code="y">2001</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="s">Phys.Lett.,B574,239</subfield>
<subfield code="y">2003</subfield>
</datafield>
</record>""")
def test_implied_ibid2(self):
ref_line = u"""[4] S. F. King and G. G. Ross, Phys. Lett. B 520, 243 (2001); C574, 239 (2003)"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="h">S. F. King and G. G. Ross</subfield>
<subfield code="s">Phys.Lett.,B520,243</subfield>
<subfield code="y">2001</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="s">Phys.Lett.,C574,239</subfield>
<subfield code="y">2003</subfield>
</datafield>
</record>""")
def test_implied_ibid3(self):
ref_line = u"""[4] S. F. King and G. G. Ross, Phys. Lett. B 520, 243 (2001); 574, 239 (2003); 575, 240 (2004); 576, 241 (2005)"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="h">S. F. King and G. G. Ross</subfield>
<subfield code="s">Phys.Lett.,B520,243</subfield>
<subfield code="y">2001</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="s">Phys.Lett.,B574,239</subfield>
<subfield code="y">2003</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="s">Phys.Lett.,B575,240</subfield>
<subfield code="y">2004</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="s">Phys.Lett.,B576,241</subfield>
<subfield code="y">2005</subfield>
</datafield>
</record>""")
def test_implied_ibid4(self):
ref_line = u"""[10] R. Foot, H.N. Long and T.A. Tran, Phys. Rev. D50, R34 (1994); H.N. Long, ibid. 53, 437 (1996); 54, 4691 (1996)."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">10</subfield>
<subfield code="h">R. Foot, H.N. Long and T.A. Tran</subfield>
<subfield code="s">Phys.Rev.,D50,R34</subfield>
<subfield code="y">1994</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">10</subfield>
<subfield code="h">H.N. Long</subfield>
<subfield code="s">Phys.Rev.,D53,437</subfield>
<subfield code="y">1996</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">10</subfield>
<subfield code="s">Phys.Rev.,D54,4691</subfield>
<subfield code="y">1996</subfield>
</datafield>
</record>""")
def test_report_number(self):
ref_line = u"""[10] [physics.plasm-ph/0409093]."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">10</subfield>
<subfield code="r">physics.plasm-ph/0409093</subfield>
</datafield>
</record>""")
def test_journal2(self):
ref_line = u"""[1] Phys.Rev. A, : 78 (2008) 012115"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="s">Phys.Rev.,A78,012115</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_authors_merge(self):
ref_line = u"""[44] R. Baier et al., Invalid. Hello. Lett. B 345 (1995)."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">44</subfield>
<subfield code="h">R. Baier et al.</subfield>
<subfield code="m">Invalid. Hello. Lett. B 345 (1995)</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_atlas_conf_99(self):
    """A two-digit-year ATLAS-CONF-99 number is normalised to the ATL-CONF- prefix."""
    citation = u'[14] ATLAS-CONF-99-078'
    expected_xml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="r">ATL-CONF-99-078</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_xml)
def test_atlas_conf_pre_2010(self):
ref_line = u'[14] ATL-CONF-2003-078'
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="r">ATL-CONF-2003-078</subfield>
</datafield>
</record>""")
def test_atlas_conf_pre_2010_2(self):
ref_line = u'[14] ATLAS-CONF-2003-078'
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="r">ATL-CONF-2003-078</subfield>
</datafield>
</record>""")
def test_atlas_conf_post_2010(self):
ref_line = u'[14] ATLAS-CONF-2012-078'
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="r">ATLAS-CONF-2012-078</subfield>
</datafield>
</record>""")
def test_atlas_conf_post_2010_2(self):
ref_line = u'[14] ATL-CONF-2012-078'
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="r">ATLAS-CONF-2012-078</subfield>
</datafield>
</record>""")
def test_atlas_conf_post_2010_invalid(self):
ref_line = u'[14] ATL-CONF-2012-0784'
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
</datafield>
</record>""")
def test_journal_missed(self):
ref_line = u"[1] M. G. Mayer, Phys. Rev. 75 (1949), 1969; O. Hazel, J. H. D. Jensen, and H. E. Suess, Phys. Rev. 75 (1949), 1766."
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">M. G. Mayer</subfield>
<subfield code="s">Phys.Rev.,75,1969</subfield>
<subfield code="y">1949</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">O. Hazel, J. H. D. Jensen, and H. E. Suess</subfield>
<subfield code="s">Phys.Rev.,75,1766</subfield>
<subfield code="y">1949</subfield>
</datafield>
</record>""")
def test_invalid_publisher(self):
"""test_invalid_publisher
This needs to not consider the lbl in Hoelbling as a publisher"""
ref_line = u"[35] G. I. Egri, Z. Fodor, C. Hoelbling, S. D. Katz, D. Nógrádi, et. al., Lattice QCD as a video game, Comput.Phys.Commun. 177 (2007) 631–639, [hep-lat/0611022]."
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">35</subfield>
<subfield code="h">G. I. Egri, Z. Fodor, C. Hoelbling, S. D. Katz, D. N\xf3gr\xe1di, et al.</subfield>
<subfield code="r">hep-lat/0611022</subfield>
</datafield>
</record>""")
def test_valid_publisher(self):
    """A standalone bracketed [LBL] is recognised as publisher subfield p.

    (The original docstring was copy-pasted from test_invalid_publisher;
    replaced with a description matching this test's expected output.)
    """
    ref_line = u"[35] [LBL]"
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">35</subfield>
<subfield code="p">LBL</subfield>
</datafield>
</record>""")
def test_missed_collaboration(self):
ref_line = u"""[76] these results replace the Λb → J/ψΛ and B0 → J/ψKS lifetime measurements of A. Abulencia et al. (CDF collaboration), Phys. Rev. Lett. 98, 122001 (2007), arXiv:hep-ex/0609021, as well as the B0 → J/ψK∗0"""
_reference_test(self, ref_line, u"""<record>
<datafield ind1="C" ind2="5" tag="999">
<subfield code="o">76</subfield>
- <subfield code="h">Abulencia et al.</subfield>
+ <subfield code="h">A. Abulencia et al.</subfield>
<subfield code="c">CDF collaboration</subfield>
<subfield code="s">Phys.Rev.Lett.,98,122001</subfield>
<subfield code="r">hep-ex/0609021</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>""")
def test_remove_duplicate_doi(self):
ref_line = u"""[1] doi:10.1007/s10440-008-9280-9 doi:10.1007/s10440-008-9280-9"""
_reference_test(self, ref_line, u"""<record>
<datafield ind1="C" ind2="5" tag="999">
<subfield code="o">1</subfield>
<subfield code="a">10.1007/s10440-008-9280-9</subfield>
</datafield>
</record>""")
def test_leftover_tag(self):
- ref_line = u"""[2] ΦΦΦΦΦΦΦΦΦΦΦΦΦΦΦΦΦΦ E. Dudas, G. von Gersdorff, J. Parmentier and S. Pokorski, arXiv:1007.5208."""
+ ref_line = u"""[2] ΦΦΦΦΦΦΦΦΦΦΦΦΦΦΦΦΦΦ^ E. Dudas, G. von Gersdorff, J. Parmentier and S. Pokorski, arXiv:1007.5208."""
_reference_test(self, ref_line, u"""<record>
<datafield ind1="C" ind2="5" tag="999">
<subfield code="o">2</subfield>
<subfield code="h">E. Dudas, G. von Gersdorff, J. Parmentier and S. Pokorski</subfield>
<subfield code="r">arXiv:1007.5208</subfield>
</datafield>
</record>""")
+ def test_leftover_number(self):
+ """test_leftover_number
+
+ The result was
+ <datafield ind1="C" ind2="5" tag="999">
+ <subfield code="o">2</subfield>
+ <subfield code="m">9</subfield>
+ <subfield code="r">CERN-LHCC-2011-999</subfield>
+ </datafield>
+ """
+ ref_line = u"""[2] CERN-LHCC2011-999"""
+ _reference_test(self, ref_line, u"""<record>
+ <datafield ind1="C" ind2="5" tag="999">
+ <subfield code="o">2</subfield>
+ <subfield code="r">CERN-LHCC-2011-999</subfield>
+ </datafield>
+</record>""", ignore_misc=False)
+
+ def test_leftover_brackets(self):
+ ref_line = u"""[2] [CERN-LHCC2011-999]"""
+ _reference_test(self, ref_line, u"""<record>
+ <datafield ind1="C" ind2="5" tag="999">
+ <subfield code="o">2</subfield>
+ <subfield code="r">CERN-LHCC-2011-999</subfield>
+ </datafield>
+</record>""", ignore_misc=False)
+
+ def test_valid_utf_8(self):
+ """Checks that the utf-8 characters are kept"""
+ ref_line = u"""[2] "Λb → J/ψΛ and B0 → J/ψKS" """
+ _reference_test(self, ref_line, u"""<record>
+ <datafield ind1="C" ind2="5" tag="999">
+ <subfield code="o">2</subfield>
+ <subfield code="t">Λb → J/ψΛ and B0 → J/ψKS</subfield>
+ </datafield>
+</record>""")
+
class TaskTest(InvenioTestCase):
    """Smoke test for the refextract bibsched task entry point."""
    def setUp(self):
        # Silence the refextract loggers so task output does not pollute the run.
        setup_loggers(verbosity=0)
    def test_task_run_core(self):
        """task_run_core(1) must run to completion; no assertions on its output."""
        from invenio.refextract_task import task_run_core
        task_run_core(1)
# NOTE(review): only RefextractTest is in the default suite; TaskTest (above)
# is excluded — presumably because it runs a real bibsched task. Confirm.
TEST_SUITE = make_test_suite(RefextractTest)
if __name__ == '__main__':
    run_test_suite(TEST_SUITE, warn_user=True)
diff --git a/modules/docextract/lib/refextract_tag.py b/modules/docextract/lib/refextract_tag.py
index 204303672..6f9b02c26 100644
--- a/modules/docextract/lib/refextract_tag.py
+++ b/modules/docextract/lib/refextract_tag.py
@@ -1,1405 +1,1402 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
import re
from unidecode import unidecode
from invenio.refextract_config import \
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL, \
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL, \
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \
CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID, \
CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID, \
CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION, \
CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION
from invenio.docextract_text import remove_and_record_multiple_spaces_in_line
from invenio.refextract_re import \
re_ibid, \
re_doi, \
re_raw_url, \
re_series_from_numeration, \
re_punctuation, \
re_correct_numeration_2nd_try_ptn1, \
re_correct_numeration_2nd_try_ptn2, \
re_correct_numeration_2nd_try_ptn3, \
re_correct_numeration_2nd_try_ptn4, \
re_numeration_nucphys_vol_page_yr, \
re_numeration_vol_subvol_nucphys_yr_page, \
re_numeration_nucphys_vol_yr_page, \
re_multiple_hyphens, \
re_numeration_vol_page_yr, \
re_numeration_vol_yr_page, \
re_numeration_vol_nucphys_series_yr_page, \
re_numeration_vol_series_nucphys_page_yr, \
re_numeration_vol_nucphys_series_page_yr, \
re_html_tagged_url, \
re_numeration_yr_vol_page, \
re_numeration_vol_nucphys_page_yr, \
re_wash_volume_tag, \
re_numeration_vol_nucphys_yr_subvol_page, \
re_quoted, \
re_isbn, \
re_arxiv, \
re_new_arxiv, \
re_pos, \
re_pos_year_num, \
re_series_from_numeration_after_volume, \
RE_OLD_ARXIV, \
RE_ARXIV_CATCHUP, \
RE_ATLAS_CONF_PRE_2010, \
RE_ATLAS_CONF_POST_2010
from invenio.authorextract_re import re_auth, \
re_auth_near_miss, \
re_etal, \
etal_matches, \
re_ed_notation
from invenio.docextract_text import wash_line
def tag_reference_line(line, kbs, record_titles_count):
    """Mark up one reference line with <cds.*> tags.

    @param line: (string) the raw reference line.
    @param kbs: (dictionary) the loaded knowledge bases ('journals',
        'journals_re', 'report-numbers', 'publishers', ...).
    @param record_titles_count: (dictionary) running counts of
        non-standard journal titles seen so far in this record.
    @return: (tuple) of the tagged line and the updated
        record_titles_count dictionary.
    """
    # First cleaning pass: bad accents, punctuation, etc.
    marked_line = wash_line(line)
    # Identify volume for POS journal
    marked_line = tag_pos_volume(marked_line)
    # Clean the line once more:
    marked_line = wash_line(marked_line)
    # Identify quoted text: useful for books matching, and it keeps the
    # author tagger away from quoted strings (a sign of a title).
    marked_line = tag_quoted_text(marked_line)
    # Identify ISBN (for books)
    marked_line = tag_isbn(marked_line)
    # Identify arxiv reports (new and old notations)
    marked_line = tag_arxiv(marked_line)
    marked_line = tag_arxiv_more(marked_line)
    # Identify volume for POS journal once more; it needs special
    # handling because the volume contains the year.
    marked_line = tag_pos_volume(marked_line)
    # Identify ATL-CONF and ATLAS-CONF report numbers: two formats
    # depending on the year, plus a 2-digit year format to convert.
    marked_line = tag_atlas_conf(marked_line)

    # Journals matched with exact regexps because their titles can
    # conflict with other elements (e.g. DAN is also a common first name)
    standardised_titles = kbs['journals'][1]
    standardised_titles.update(kbs['journals_re'])
    journals_matches = identifiy_journals_re(marked_line, kbs['journals_re'])

    # Build the upper-cased, punctuation-free working copy used by the
    # title / report-number recognisers, recording removed spaces.
    stripped_line = strip_tags(marked_line)
    stripped_line = stripped_line.upper()
    stripped_line = re_punctuation.sub(u' ', stripped_line)
    removed_spaces, stripped_line = \
        remove_and_record_multiple_spaces_in_line(stripped_line)

    # Identify and record coordinates of institute preprint report numbers:
    found_pprint_repnum_matchlens, found_pprint_repnum_replstr, stripped_line = \
        identify_report_numbers(stripped_line, kbs['report-numbers'])
    # Identify and record coordinates of non-standard journal titles:
    journals_matches_more, stripped_line, line_titles_count = \
        identify_journals(stripped_line, kbs['journals'])
    journals_matches.update(journals_matches_more)
    # Fold this line's 'bad titles' count into the record total:
    record_titles_count = sum_2_dictionaries(record_titles_count,
                                             line_titles_count)

    # Attempt to identify, record and replace any IBIDs in the line:
    if stripped_line.upper().find(u"IBID") != -1:
        # At least one IBID is present - try to identify its meaning:
        found_ibids_matchtext, stripped_line = identify_ibids(stripped_line)
        # Merge the matched IBID(s) into the journal matches:
        journals_matches.update(found_ibids_matchtext)

    publishers_matches = identify_publishers(stripped_line, kbs['publishers'])

    tagged_line = process_reference_line(
        working_line=marked_line,
        journals_matches=journals_matches,
        pprint_repnum_len=found_pprint_repnum_matchlens,
        pprint_repnum_matchtext=found_pprint_repnum_replstr,
        publishers_matches=publishers_matches,
        removed_spaces=removed_spaces,
        standardised_titles=standardised_titles,
        kbs=kbs,
    )

    return tagged_line, record_titles_count
def process_reference_line(working_line,
                           journals_matches,
                           pprint_repnum_len,
                           pprint_repnum_matchtext,
                           publishers_matches,
                           removed_spaces,
                           standardised_titles,
                           kbs):
    """Rebuild a reference line with standardised, tagged citations.

    After the phase of identifying and tagging citation instances in a
    reference line, this function goes through the line and the collected
    information about the recognised citations, and rebuilds the line so
    that journals, report numbers, publishers, numeration, authors and
    collaborations are wrapped in <cds.*> tags.

    @param working_line: (string) - the line before the punctuation was
        stripped.  At this stage, it has not been capitalised, and
        neither TITLEs nor REPORT NUMBERs have been stripped from it.
        However, any recognised numeration and/or URLs have been tagged
        with <cds.YYYY> tags.
    @param journals_matches: (dictionary) - the matched journal titles
        (including IBIDs), keyed by the index of each match within the
        working line.
    @param pprint_repnum_len: (dictionary) - the lengths of the matched
        institutional preprint report-number citations, keyed by the
        index within the line of each match.
    @param pprint_repnum_matchtext: (dictionary) - the replacement text
        for each matched institutional report number, keyed by the index
        within the line of each match.
    @param publishers_matches: (dictionary) - the matched publishers,
        keyed by the index within the line of each match.
    @param removed_spaces: (dictionary) - the number of spaces removed
        from the various positions in the line, keyed by the index of the
        position within the line at which the spaces were removed.
    @param standardised_titles: (dictionary) - the standardised journal
        titles, keyed by the non-standard versions of those titles.
    @param kbs: (dictionary) - the loaded knowledge bases.
    @return: (string) - the rebuilt, tagged reference line, with any
        newline characters removed.
    """
    if len(journals_matches) + len(pprint_repnum_len) + len(publishers_matches) == 0:
        # no TITLE or REPORT-NUMBER citations were found within this line,
        # use the raw line: (This 'raw' line could still be tagged with
        # recognised URLs or numeration.)
        tagged_line = working_line
    else:
        # TITLE and/or REPORT-NUMBER citations were found in this line,
        # build a new version of the working-line in which the standard
        # versions of the REPORT-NUMBERs and TITLEs are tagged:
        startpos = 0          # First cell of the reference line...
        previous_match = {}   # previously matched TITLE within line (used
                              # for replacement of IBIDs).
        replacement_types = {}  # NOTE: overwritten below by
                                # get_replacement_types()
        # Sort the matches of each kind by their position in the line
        # (Python 2 idiom: keys() returns a list sorted in place):
        journals_keys = journals_matches.keys()
        journals_keys.sort()
        reports_keys = pprint_repnum_matchtext.keys()
        reports_keys.sort()
        publishers_keys = publishers_matches.keys()
        publishers_keys.sort()
        spaces_keys = removed_spaces.keys()
        spaces_keys.sort()
        replacement_types = get_replacement_types(journals_keys,
                                                  reports_keys,
                                                  publishers_keys)
        replacement_locations = replacement_types.keys()
        replacement_locations.sort()

        tagged_line = u""  # This is to be the new 'working-line'. It will
                           # contain the tagged TITLEs and REPORT-NUMBERs,
                           # as well as any previously tagged URLs and
                           # numeration components.
        # Rebuild the line, replacement by replacement, left to right:
        for replacement_index in replacement_locations:
            # first, factor in any stripped spaces before this 'replacement'
            true_replacement_index, extras = \
                account_for_stripped_whitespace(spaces_keys,
                                                removed_spaces,
                                                replacement_types,
                                                pprint_repnum_len,
                                                journals_matches,
                                                replacement_index)
            if replacement_types[replacement_index] == u"journal":
                # Add a tagged periodical TITLE into the line:
                rebuilt_chunk, startpos, previous_match = \
                    add_tagged_journal(
                        reading_line=working_line,
                        journal_info=journals_matches[replacement_index],
                        previous_match=previous_match,
                        startpos=startpos,
                        true_replacement_index=true_replacement_index,
                        extras=extras,
                        standardised_titles=standardised_titles)
                tagged_line += rebuilt_chunk
            elif replacement_types[replacement_index] == u"reportnumber":
                # Add a tagged institutional preprint REPORT-NUMBER
                # into the line:
                rebuilt_chunk, startpos = \
                    add_tagged_report_number(
                        reading_line=working_line,
                        len_reportnum=pprint_repnum_len[replacement_index],
                        reportnum=pprint_repnum_matchtext[replacement_index],
                        startpos=startpos,
                        true_replacement_index=true_replacement_index,
                        extras=extras)
                tagged_line += rebuilt_chunk
            elif replacement_types[replacement_index] == u"publisher":
                # Add a tagged, standardised PUBLISHER into the line:
                rebuilt_chunk, startpos = \
                    add_tagged_publisher(
                        reading_line=working_line,
                        matched_publisher=publishers_matches[replacement_index],
                        startpos=startpos,
                        true_replacement_index=true_replacement_index,
                        extras=extras,
                        kb_publishers=kbs['publishers'])
                tagged_line += rebuilt_chunk

        # add the remainder of the original working-line into the rebuilt line:
        tagged_line += working_line[startpos:]

        # we have all the numeration; we can make sure there's no space
        # between the volume letter and the volume number
        # e.g. B 20 -> B20
        tagged_line = wash_volume_tag(tagged_line)

    # Try to find any authors in the line
    tagged_line = identify_and_tag_authors(tagged_line, kbs['authors'])
    # Try to find any collaboration in the line
    tagged_line = identify_and_tag_collaborations(tagged_line,
                                                  kbs['collaborations'])

    return tagged_line.replace('\n', '')
def wash_volume_tag(line):
    """Normalise tagged volumes (e.g. collapse "B 20" into "B20")."""
    pattern, replacement = re_wash_volume_tag
    return pattern.sub(replacement, line)
def tag_isbn(line):
"""Tag books ISBN"""
return re_isbn.sub(ur'<cds.ISBN>\g<code></cds.ISBN>', line)
def tag_quoted_text(line):
"""Tag quoted titles
We use titles for pretty display of references that we could not
associate we record.
We also use titles for recognising books.
"""
return re_quoted.sub(ur'<cds.QUOTED>\g<title></cds.QUOTED>', line)
def tag_arxiv(line):
    """Tag arXiv report numbers.

    Two input styles are handled:
      * prefixed, e.g. arXiv:1022.1111
      * bare numeration, exactly 9999.9999
    The output is normalised to the standard arXiv notation, e.g.
    arXiv:2007.12.1111 or arXiv:2007.12.1111v2.
    """
    def mark(match):
        fields = match.groupdict()
        # Keep an optional version suffix, separated by a space.
        if match.group('suffix'):
            fields['suffix'] = ' ' + fields['suffix']
        else:
            fields['suffix'] = ''
        return u'<cds.REPORTNUMBER>arXiv:%(year)s'\
               u'%(month)s.%(num)s%(suffix)s' \
               u'</cds.REPORTNUMBER>' % fields

    for pattern in (re_arxiv, re_new_arxiv):
        line = pattern.sub(mark, line)
    return line
def tag_arxiv_more(line):
"""Tag old arxiv report numbers
Either formats:
* hep-th/1234567
* arXiv:1022111 [hep-ph] which transforms to hep-ph/1022111
"""
line = RE_ARXIV_CATCHUP.sub(ur"\g<suffix>/\g<year>\g<month>\g<num>", line)
for report_re, report_repl in RE_OLD_ARXIV:
report_number = report_repl + ur"/\g<num>"
line = report_re.sub(u'<cds.REPORTNUMBER>' + report_number \
+ u'</cds.REPORTNUMBER>',
line)
return line
def tag_pos_volume(line):
    """Tag POS volume numbers.

    PoS is a journal with special volume numbers,
    e.g. "PoS LAT2007 (2007) 369".
    """
    def mark(match):
        fields = match.groupdict()
        try:
            year = match.group('year')
        except IndexError:
            # This pattern has no 'year' group: extract the year from the
            # volume name, which should always include it.
            found = re.search(re_pos_year_num, match.group('volume_num'),
                              re.UNICODE)
            year = found.group(0)

        if year:
            fields['year'] = ' <cds.YR>(%s)</cds.YR>' % year.strip().strip('()')
        else:
            fields['year'] = ''

        return '<cds.JOURNAL>PoS</cds.JOURNAL>' \
            ' <cds.VOL>%(volume_name)s%(volume_num)s</cds.VOL>' \
            '%(year)s' \
            ' <cds.PG>%(page)s</cds.PG>' % fields

    for pattern in re_pos:
        line = pattern.sub(mark, line)
    return line
def tag_atlas_conf(line):
line = RE_ATLAS_CONF_PRE_2010.sub(
ur'<cds.REPORTNUMBER>ATL-CONF-\g<code></cds.REPORTNUMBER>', line)
line = RE_ATLAS_CONF_POST_2010.sub(
ur'<cds.REPORTNUMBER>ATLAS-CONF-\g<code></cds.REPORTNUMBER>', line)
return line
def identifiy_journals_re(line, kb_journals):
    """Find journals that must be matched via an exact regexp.

    @param line: (string) the working reference line.
    @param kb_journals: (iterable) of (pattern, standard-title) pairs;
        only the pattern is used here.
    @return: (dictionary) of matched text keyed by match start position.
    """
    found = {}
    for pattern, dummy in kb_journals:
        hit = re.search(pattern, line)
        if hit is not None:
            found[hit.start()] = hit.group(0)
    return found
def find_numeration_more(line):
    """Attempt the second-chance numeration patterns on the line.

    @param line: (string) the reference line (possibly with tagged titles).
    @return: (dictionary) with 'year', 'series', 'volume', 'page' and
        'len' (length of the text after the title that was consumed), or
        None when no pattern matches.
    """
    patterns = (
        re_correct_numeration_2nd_try_ptn1,
        re_correct_numeration_2nd_try_ptn2,
        re_correct_numeration_2nd_try_ptn3,
        re_correct_numeration_2nd_try_ptn4,
    )
    for pattern in patterns:
        match = pattern.search(line)
        if not match:
            continue
        info = match.groupdict()
        series = extract_series_from_volume(info['vol'])
        # Fall back through the alternative volume-number groups.
        volume_num = info['vol_num'] or info['vol_num_alt'] or info['vol_num_alt2']
        return {'year': info.get('year', None),
                'series': series,
                'volume': volume_num,
                'page': info['page'],
                'len': len(info['aftertitle'])}
    return None
def add_tagged_report_number(reading_line,
                             len_reportnum,
                             reportnum,
                             startpos,
                             true_replacement_index,
                             extras):
    """In rebuilding the line, add an identified institutional
    REPORT-NUMBER (standardised and tagged) into the line.

    @param reading_line: (string) The reference line before capitalization
        was performed, and before REPORT-NUMBERs and TITLEs were stripped.
    @param len_reportnum: (integer) the length of the matched REPORT-NUMBER.
    @param reportnum: (string) the replacement text for the matched
        REPORT-NUMBER.
    @param startpos: (integer) the pointer to the next position in the
        reading-line from which to start rebuilding.
    @param true_replacement_index: (integer) the replacement index of the
        matched REPORT-NUMBER in the reading-line, with stripped
        punctuation and whitespace accounted for.
    @param extras: (integer) extras to be added into the replacement index.
    @return: (tuple) containing a string (the rebuilt line segment) and an
        integer (the next 'startpos' in the reading-line).
    """
    # Copy the reading-line contents up to the REPORT-NUMBER, stopping one
    # character early when possible so an enclosing opening brace is
    # dropped from the rebuilt segment:
    if true_replacement_index - startpos - 1 >= 0:
        segment = reading_line[startpos:true_replacement_index - 1]
    else:
        segment = reading_line[startpos:true_replacement_index]

    # Append the tagged REPORT-NUMBER:
    segment += u"<cds.REPORTNUMBER>%(reportnum)s</cds.REPORTNUMBER>" \
        % {'reportnum': reportnum}

    # Move the pointer in the reading-line past the current match:
    next_pos = true_replacement_index + len_reportnum + extras
    # Move past a closing brace for the report number (if there was one):
    try:
        if reading_line[next_pos] in (u"]", u")"):
            next_pos += 1
    except IndexError:
        # moved past end of line - ignore
        pass

    # Return the rebuilt segment and the position in the reading-line from
    # which to continue rebuilding up to the next match:
    return segment, next_pos
def add_tagged_journal_in_place_of_IBID(previous_match):
    """Build the tagged replacement for an IBID from the previous title.

    When a matched TITLE was actually an IBID, it is replaced by the
    previously matched TITLE, wrapped in the IBID markers.  (Series
    handling - e.g. "Nucl. Phys. B" followed by "IBID A" - is done by the
    caller through previous_match['series'].)

    @param previous_match: (dict) the previously matched TITLE; its
        'title' entry is inserted into the line.
    @return: (string) the rebuilt, tagged line segment.
    """
    return " " + CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID \
        + previous_match['title'] \
        + CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID
def extract_series_from_volume(volume):
    """Return the series letter embedded in a volume string, or None."""
    for pattern in (re_series_from_numeration,
                    re_series_from_numeration_after_volume):
        found = pattern.search(volume)
        if found:
            return found.group(1)
    return None
def create_numeration_tag(info):
    """Build the <cds.VOL>/<cds.YR>/<cds.PG> markup for a numeration dict.

    @param info: (dict) with 'series' (may be None), 'volume', 'page' and
        optionally 'year' entries.
    @return: (string) the tagged numeration, prefixed with a space.
    """
    volume = info['volume']
    if info['series']:
        # Prepend the series letter to the volume number (e.g. B + 20).
        volume = info['series'] + volume
    parts = [u' <cds.VOL>%s</cds.VOL>' % volume]
    if info.get('year', False):
        parts.append(u' <cds.YR>(%(year)s)</cds.YR>' % info)
    parts.append(u' <cds.PG>%(page)s</cds.PG>' % info)
    return u''.join(parts)
def add_tagged_journal(reading_line,
                       journal_info,
                       previous_match,
                       startpos,
                       true_replacement_index,
                       extras,
                       standardised_titles):
    """In rebuilding the line, add an identified periodical TITLE
    (standardised and tagged) into the line, together with its numeration
    when it can be recognised.

    @param reading_line: (string) The reference line before capitalization
        was performed, and before REPORT-NUMBERs and TITLEs were stripped
        out.
    @param journal_info: (string) the (non-standard) matched TITLE text,
        or an IBID marker.
    @param previous_match: (dict) the previous periodical TITLE citation
        matched in the current reference line ('title' and 'series'
        entries).  It is used when replacing an IBID instance in the line.
    @param startpos: (integer) the pointer to the next position in the
        reading-line from which to start rebuilding.
    @param true_replacement_index: (integer) the replacement index of the
        matched TITLE in the reading-line, with stripped punctuation and
        whitespace accounted for.
    @param extras: (integer) extras to be added into the replacement index.
    @param standardised_titles: (dictionary) the standardised versions of
        periodical titles, keyed by their various non-standard versions.
    @return: (tuple) containing a string (the rebuilt line segment - empty
        when the match is abandoned), an integer (the next 'startpos' in
        the reading-line), and a dict (the newly updated previous-match).
    """
    # Keep the incoming state so the match can be rolled back if no
    # numeration is found after the title:
    old_startpos = startpos
    old_previous_match = previous_match
    skip_numeration = False
    series = None

    def skip_ponctuation(line, pos):
        # Skip past any punctuation at the end of the replacement that was
        # just made:
        try:
            while line[pos] in (".", ":", "-", ")"):
                pos += 1
        except IndexError:
            # The match was at the very end of the line
            pass
        return pos

    # Fill 'rebuilt_line' (the segment of the line that is being rebuilt to
    # include the tagged and standardised periodical TITLE) with the contents
    # of the reading-line, up to the point of the matched TITLE:
    rebuilt_line = reading_line[startpos:true_replacement_index]
    # Test to see whether a title or an "IBID" was matched:
    if journal_info.upper().find("IBID") != -1:
        # This is an IBID
        # Try to replace the IBID with a title:
        if previous_match:
            # Replace this IBID with the previous title match, if possible:
            rebuilt_line += add_tagged_journal_in_place_of_IBID(previous_match)
            series = previous_match['series']
            # Update start position for next segment of original line:
            startpos = true_replacement_index + len(journal_info) + extras
            startpos = skip_ponctuation(reading_line, startpos)
        else:
            # No previous title to substitute: abandon this IBID match.
            rebuilt_line = ""
            skip_numeration = True
    else:
        # The standardised title may carry a series letter after a ';':
        if ';' in standardised_titles[journal_info]:
            title, series = \
                standardised_titles[journal_info].rsplit(';', 1)
            series = series.strip()
            previous_match = {'title': title,
                              'series': series}
        else:
            title = standardised_titles[journal_info]
            previous_match = {'title': title,
                              'series': None}

        # This is a normal title, not an IBID
        rebuilt_line += "<cds.JOURNAL>%s</cds.JOURNAL>" % title
        startpos = true_replacement_index + len(journal_info) + extras
        startpos = skip_ponctuation(reading_line, startpos)

    if not skip_numeration:
        # Check for numeration
        numeration_line = reading_line[startpos:]
        # First look for standard numeration
        numerotation_info = find_numeration(numeration_line)
        if not numerotation_info:
            numeration_line = rebuilt_line + " " + numeration_line
            # Now look for more funky numeration
            # With possibly some elements before the journal title
            numerotation_info = find_numeration_more(numeration_line)

        if not numerotation_info:
            # No numeration found: roll back to the incoming state and
            # abandon this title match.
            startpos = old_startpos
            previous_match = old_previous_match
            rebuilt_line = ""
        else:
            # Inherit the series from the title when the numeration did
            # not provide one:
            if series and not numerotation_info['series']:
                numerotation_info['series'] = series
            startpos += numerotation_info['len']
            rebuilt_line += create_numeration_tag(numerotation_info)

            previous_match['series'] = numerotation_info['series']

    # return the rebuilt line-segment, the position (of the reading line) from
    # which the next part of the rebuilt line should be started, and the newly
    # updated previous match.
    return rebuilt_line, startpos, previous_match
def add_tagged_publisher(reading_line,
                         matched_publisher,
                         startpos,
                         true_replacement_index,
                         extras,
                         kb_publishers):
    """In rebuilding the line, add an identified PUBLISHER (standardised
    and tagged) into the line.

    @param reading_line: (string) The reference line before capitalization
        was performed, and before REPORT-NUMBERs and TITLEs were stripped
        out.
    @param matched_publisher: (string) the non-standard publisher text
        matched in the working line.
    @param startpos: (integer) the pointer to the next position in the
        reading-line from which to start rebuilding.
    @param true_replacement_index: (integer) the replacement index of the
        matched publisher in the reading-line, with stripped punctuation
        and whitespace accounted for.
    @param extras: (integer) extras to be added into the replacement index.
    @param kb_publishers: (dictionary) the publishers knowledge base; each
        value is a dict whose 'repl' entry holds the standardised name.
    @return: (tuple) containing a string (the rebuilt line segment) and an
        integer (the next 'startpos' in the reading-line).
    """
    # Copy the untouched text up to the matched publisher:
    segment = reading_line[startpos:true_replacement_index]
    # Append the standardised, tagged publisher name:
    standard_name = kb_publishers[matched_publisher]['repl']
    segment += "<cds.PUBLISHER>%(title)s</cds.PUBLISHER>" \
        % {'title': standard_name}
    # Advance the pointer past the matched (non-standard) text:
    next_pos = true_replacement_index + len(matched_publisher) + extras
    return segment, next_pos
def get_replacement_types(titles, reportnumbers, publishers):
    """Map each replacement position in the line to the kind of citation
    recognised there.

    The description strings are:
        'journal'      - the replacement is a periodical title
        'reportnumber' - the replacement is a preprint report number
        'publisher'    - the replacement is a publisher name
    When two kinds share a position, the later list wins (reportnumbers
    override titles, publishers override both).

    @param titles: (list) of locations at which periodical titles were
        found.
    @param reportnumbers: (list) of locations at which report numbers
        were found.
    @param publishers: (list) of locations at which publishers were found.
    @return: (dictionary) of replacement types keyed by location.
    """
    rep_types = {}
    for positions, kind in ((titles, u"journal"),
                            (reportnumbers, u"reportnumber"),
                            (publishers, u"publisher")):
        for idx in positions:
            rep_types[idx] = kind
    return rep_types
def account_for_stripped_whitespace(spaces_keys,
                                    removed_spaces,
                                    replacement_types,
                                    len_reportnums,
                                    journals_matches,
                                    replacement_index):
    """Translate a match index from the stripped working line back to the
    original reading line.

    The working line had punctuation and multiple spaces stripped, so the
    index of a recognised citation differs from its index in the original
    reading line.  For example:
        [26] E. Witten and S.-T. Yau, hep-th/9910245.
    becomes, after stripping:
        [26] E WITTEN AND S T YAU HEP TH/9910245
    This function adds back the spaces removed before the match (shifting
    the replacement index), and counts spaces removed from inside the
    matched text itself (returned as 'extras').

    @param spaces_keys: (list) - the indices at which spaces were removed
        from the reference line.
    @param removed_spaces: (dictionary) - keyed by the indices at which
        spaces were removed from the line; the values are the number of
        spaces actually removed from that position.
    @param replacement_types: (dictionary) - the type of replacement to
        make ('journal'/'reportnumber'/'publisher') at each index.
    @param len_reportnums: (dictionary) - the lengths of the
        REPORT-NUMBERs matched at the various indices in the line.
    @param journals_matches: (dictionary) - the journal text matched at
        the various indices in the line.
    @param replacement_index: (integer) - the index in the working line of
        the identified TITLE or REPORT-NUMBER citation.
    @return: (tuple) of the true replacement index in the reading line and
        the extras to add into the replacement index.
    """
    extras = 0
    true_idx = replacement_index
    spare_idx = replacement_index
    kind = replacement_types[replacement_index]

    for space in spaces_keys:
        if space < true_idx:
            # Spaces were stripped before the current replacement: shift
            # the replacement index by the number of removed spaces.
            true_idx += removed_spaces[space]
            spare_idx += removed_spaces[space]
        elif space >= spare_idx and kind == u"journal" and \
                space < (spare_idx +
                         len(journals_matches[replacement_index])):
            # Multi-spaces were stripped from inside the matched
            # periodical title before its recognition:
            spare_idx += removed_spaces[space]
            extras += removed_spaces[space]
        elif space >= spare_idx and kind == u"reportnumber" and \
                space < (spare_idx + len_reportnums[replacement_index]):
            # Multi-spaces were stripped from inside the matched preprint
            # report number before its recognition:
            spare_idx += removed_spaces[space]
            extras += removed_spaces[space]

    # return the new values for replacement indices with stripped
    # whitespace accounted for:
    return true_idx, extras
def strip_tags(line):
    """Replace every <cds.*> tag, together with its content, by a run of
    underscores of the same length.

    Keeping the length identical preserves all character offsets, so later
    matchers (e.g. the author tagger) can look for underscores to detect
    already-tagged regions.  Tags are never nested, so a flat pattern is
    sufficient.
    """
    tag_pattern = re.compile(u'<cds\\.[A-Z]+>[^<]*</cds\\.[A-Z]+>'
                             u'|<cds\\.[A-Z]+ />',
                             re.UNICODE)
    return tag_pattern.sub(lambda match: u'_' * len(match.group(0)), line)
def identify_and_tag_collaborations(line, collaborations_kb):
    """Tag collaborations in a line whose other matches are already tagged.

    Given a line where authors have been tagged, and all other tags and
    content have been replaced with underscores, identify collaboration
    names and wrap them in the collaboration markers.  Later on, these
    tagged pieces of information will be merged into the content of the
    most recently found author.  Separate tags are used so the reference
    splitting heuristics (which look at multiple <AUTH> tags in a line)
    are not influenced.
    """
    for dummy, re_collab in collaborations_kb.iteritems():
        # Match against a tag-stripped copy, then splice the markers into
        # the real line; iterate backwards so earlier offsets stay valid.
        found = list(re_collab.finditer(strip_tags(line)))
        for match in reversed(found):
            line = line[:match.start()] \
                + CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION \
                + match.group(1).strip(".,:;- [](){}") \
                + CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION \
                + line[match.end():]
    return line
def identify_and_tag_authors(line, authors_kb):
    """Given a reference, look for a group of author names,
    place tags around the author group, return the newly tagged line.

    @param line: (string) the reference line, in which non-author elements
        have already been tagged with <cds.*> tags.
    @param authors_kb: (iterable) of (pattern, replacement) string pairs
        used to normalise author names that do not convert well from
        utf-8.
    @return: (string) the line with author groups wrapped in
        <cds.AUTHstnd>, <cds.AUTHetal> or <cds.AUTHincl> tags.
    """
    # Replace authors which do not convert well from utf-8
    for pattern, repl in authors_kb:
        line = line.replace(pattern, repl)

    output_line = line

    # Match authors on the tag-stripped line (tags are underscored so
    # their offsets are preserved):
    line = strip_tags(output_line)
    matched_authors = list(re_auth.finditer(line))
    # We try to get better results by unidecoding (transliterating
    # accented characters to ASCII):
    unidecoded_line = strip_tags(unidecode(output_line))
    matched_authors_unidecode = list(re_auth.finditer(unidecoded_line))

    if len(matched_authors_unidecode) > len(matched_authors):
        output_line = unidecode(output_line)
        # BUGFIX: the match offsets below refer to the unidecoded string
        # (unidecode may change the string length), so the working copy
        # must be the unidecoded one as well.
        line = unidecoded_line
        matched_authors = matched_authors_unidecode

    # If there is at least one matched author group
    if matched_authors:
        matched_positions = []
        preceeding_text_string = line
        preceeding_text_start = 0
        for auth_no, match in enumerate(matched_authors):
            # Only if there are no underscores (previously tagged content)
            # in the matched author group.  This must be checked for here,
            # as it cannot be applied to the re without clashing with
            # other Unicode characters.
            if line[match.start():match.end()].find("_") == -1:
                # Groups: 'et'/'et2' = 'et al'; 'es' = ed. before the
                # author; 'ee' = ed. after the author.
                matched_positions.append({
                    'start': match.start(),
                    'end': match.end(),
                    'etal': match.group('et') or match.group('et2'),
                    'ed_start': match.group('es'),
                    'ed_end': match.group('ee'),
                    'multi_auth': match.group('multi_auth'),
                    'multi_surs': match.group('multi_surs'),
                    'text_before': preceeding_text_string[preceeding_text_start:match.start()],
                    'auth_no': auth_no,
                    'author_names': match.group('author_names')
                })
                # Save the end of the match, from where to snip the misc
                # text found before an author match
                preceeding_text_start = match.end()

        # Work backwards to avoid index problems when adding AUTH tags
        matched_positions.reverse()
        for m in matched_positions:
            dump_in_misc = False
            start = m['start']
            end = m['end']

            # Check the text before the current match to see if it has a
            # bad 'et al'
            lower_text_before = m['text_before'].strip().lower()
            for e in etal_matches:
                if lower_text_before.endswith(e):
                    # If so, this author match is likely to be a bad match
                    # on a missed title
                    dump_in_misc = True
                    break

            # An AND found here likely indicates a missed author before
            # this text, and triggers weaker author searching within the
            # previous misc text.  A bad 'and' is only denoted as such if
            # there exists only one author after it and the author group
            # is legit (not to be dumped in misc).
            if not dump_in_misc and not (m['multi_auth'] or m['multi_surs']) \
                    and (lower_text_before.endswith(' and')):
                # Search using a weaker author pattern to try and find the
                # missed author(s) (cut away the end 'and')
                weaker_match = re_auth_near_miss.match(m['text_before'])
                if weaker_match and not (weaker_match.group('es') or weaker_match.group('ee')):
                    # Change the start of the author group to include this
                    # new author group
                    start = start - (len(m['text_before']) - weaker_match.start())
                # Still no match, do not add tags for this author match..
                # dump it into misc
                else:
                    dump_in_misc = True

            add_to_misc = ""
            # If a semi-colon was found at the end of this author group,
            # keep it in misc so that it can be looked at for splitting
            # heuristics
            if len(output_line) > m['end']:
                if output_line[m['end']].strip(" ,.") == ';':
                    add_to_misc = ';'

            # Standardize eds. notation.
            # BUGFIX: re.IGNORECASE was previously passed as re.sub()'s
            # positional *count* argument (limiting replacements to 2)
            # rather than as a flag; drop the stray argument.
            tmp_output_line = re.sub(re_ed_notation, '(ed.)',
                                     output_line[start:end])
            # Standardize et al. notation
            tmp_output_line = re.sub(re_etal, 'et al.', tmp_output_line)
            # Strip
            tmp_output_line = tmp_output_line.lstrip('.').strip(",:;- [](")
            if not tmp_output_line.endswith('(ed.)'):
                tmp_output_line = tmp_output_line.strip(')')

            # ONLY wrap author data with tags IF there is no evidence that
            # it is an ed. author (i.e. the author is not referred to as
            # an editor).  Does this author group string have 'et al.'?
            if m['etal'] and not (m['ed_start'] or m['ed_end'] or dump_in_misc):
                output_line = output_line[:start] \
                    + "<cds.AUTHetal>" \
                    + tmp_output_line \
                    + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL \
                    + add_to_misc \
                    + output_line[end:]
            elif not (m['ed_start'] or m['ed_end'] or dump_in_misc):
                # Insert the std (standard) tag
                output_line = output_line[:start] \
                    + "<cds.AUTHstnd>" \
                    + tmp_output_line \
                    + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND \
                    + add_to_misc \
                    + output_line[end:]
            # Apply the 'include in $h' method to author groups marked as
            # editors
            elif m['ed_start'] or m['ed_end']:
                ed_notation = " (eds.)"
                # Standardize et al. notation
                tmp_output_line = re.sub(re_etal, 'et al.',
                                         m['author_names'])
                # remove any characters which denote this author group
                # to be editors, just take the
                # author names, and append '(ed.)'
                output_line = output_line[:start] \
                    + "<cds.AUTHincl>" \
                    + tmp_output_line.strip(",:;- [](") \
                    + ed_notation \
                    + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL \
                    + add_to_misc \
                    + output_line[end:]

    return output_line
def sum_2_dictionaries(dicta, dictb):
    """Given two dictionaries of totals, where each total refers to a key
       in the dictionary, add the totals.
       E.g.:  dicta = { 'a' : 3, 'b' : 1 }
              dictb = { 'a' : 1, 'c' : 5 }
              dicta + dictb = { 'a' : 4, 'b' : 1, 'c' : 5 }
       @param dicta: (dictionary)
       @param dictb: (dictionary)
       @return: (dictionary) - the sum of the 2 dictionaries
    """
    dict_out = dicta.copy()
    for key, total in dictb.items():
        # BUGFIX: the original tested the literal string 'key' in dict_out,
        # which is always False, so totals from dictb silently overwrote
        # those in dicta instead of being added to them.
        if key in dict_out:
            # Add the total for this key in dictb to that of dict_out:
            dict_out[key] += total
        else:
            # the key is not in the first dictionary - add it directly:
            dict_out[key] = total
    return dict_out
def identify_ibids(line):
    """Locate IBID markers within a reference line.
       Each matched IBID is recorded, keyed by its start offset, and the
       matched text is masked out of the working line with underscores so
       that later processing stages cannot re-match it.
       @param line: (string) the working reference line
       @return: (tuple) containing a dictionary and a string:
                  Dictionary: (Key: position of IBID in line;
                               Value: matched IBID text)
                  String: working line with matched IBIDs masked out
    """
    ibid_match_txt = {}
    for ibid in re_ibid.finditer(line):
        # Record this IBID match, keyed by its position in the line:
        ibid_match_txt[ibid.start()] = ibid.group(0)
        # Mask the matched text with underscores of the same length, so
        # offsets reported by the (already-running) iterator stay valid:
        mask = "_" * len(ibid.group(0))
        line = line[:ibid.start()] + mask + line[ibid.end():]
    return ibid_match_txt, line
def find_all(string, sub):
    """Return the start index of every occurrence of sub within string,
       in ascending order. Overlapping occurrences are included, since the
       search resumes one character after each hit.
       @param string: (string) the text to search in.
       @param sub: (string) the substring to look for.
       @return: (list) of integer start indices (empty when no match).
    """
    positions = []
    hit = string.find(sub)
    while hit != -1:
        positions.append(hit)
        hit = string.find(sub, hit + 1)
    return positions
def find_numeration(line):
    """Given a reference line, attempt to locate instances of citation
       'numeration' in the line.
       @param line: (string) the reference line.
       @return: (dict) with keys 'year', 'series', 'volume', 'page' and
           'len' (the length of the matched text) for the first pattern
           that matches, or None when no numeration is recognised.
    """
    recognisers = (
        # vol,page,year
        re_numeration_vol_page_yr,
        re_numeration_vol_nucphys_page_yr,
        re_numeration_nucphys_vol_page_yr,
        # With sub volume
        re_numeration_vol_subvol_nucphys_yr_page,
        re_numeration_vol_nucphys_yr_subvol_page,
        # vol,year,page
        re_numeration_vol_yr_page,
        re_numeration_nucphys_vol_yr_page,
        re_numeration_vol_nucphys_series_yr_page,
        # vol,page,year
        re_numeration_vol_series_nucphys_page_yr,
        re_numeration_vol_nucphys_series_page_yr,
        # year,vol,page
        re_numeration_yr_vol_page,
    )
    for recogniser in recognisers:
        m = recogniser.match(line)
        if not m:
            continue
        groups = m.groupdict()
        # Derive the series letter from the volume string when the
        # pattern did not capture one explicitly:
        series = groups.get('series', None) \
            or extract_series_from_volume(groups['vol'])
        # Fall back through the alternative volume-number captures
        # (same short-circuit order as checking each in turn):
        volume = groups['vol_num'] \
            or groups['vol_num_alt'] \
            or groups['vol_num_alt2']
        return {'year': groups.get('year', None),
                'series': series,
                'volume': volume,
                'page': groups['page'],
                'len': m.end()}
    return None
def identify_journals(line, kb_journals):
    """Attempt to identify all periodical titles in a reference line.
       Titles are identified, their information (location in line and the
       matched title) is recorded, and they are replaced in the working
       line by underscores.
       @param line: (string) - the working reference line.
       @param kb_journals: (tuple) - the journals knowledge base; element
           0 is a dictionary of compiled search patterns, keyed by the
           TITLE string itself, and element 2 is the list of non-standard
           periodical TITLEs to search for. This list of titles has
           already been ordered and is used to force the order of
           searching.
       @return: (tuple) containing 3 elements:
                  + (dictionary) - the title matched at each given
                                   index within the line.
                  + (string)     - the working line, with the titles
                                   removed from it and replaced by
                                   underscores.
                  + (dictionary) - the totals for each bad-title found
                                   in the line.
    """
    search_patterns = kb_journals[0]
    search_keys = kb_journals[2]
    title_matches = {}   # position in line -> matched title
    titles_count = {}    # title -> number of occurrences found in line
    # Search for each known periodical title, in the forced order:
    for title in search_keys:
        for hit in search_patterns[title].finditer(line):
            # Count this occurrence of the title:
            titles_count[title] = titles_count.get(title, 0) + 1
            # Record the title matched at this position:
            title_matches[hit.start()] = title
            # Replace the matched title text with n underscores, where n
            # is the length of the title itself:
            masked_len = len(title)
            line = u"".join((line[:hit.start()],
                             u"_" * masked_len,
                             line[hit.start() + masked_len:]))
    # Return the recorded matches along with the changed working line:
    return title_matches, line, titles_count
def identify_report_numbers(line, kb_reports):
    """Attempt to identify all preprint report numbers in a reference
       line.
       Report numbers will be identified, their information (location
       in line, length in line, and standardised replacement version)
       will be recorded, and they will be replaced in the working-line
       by underscores.
       @param line: (string) - the working reference line.
       @param kb_reports: (tuple) - 2 elements:
          * a dictionary of compiled regexp patterns used to identify
            preprint report numbers, keyed by report-number category;
          * a dictionary of the standardised version of each preprint
            report-number category.
       @return: (tuple) - 3 elements:
           * a dictionary containing the lengths in the line of the
             matched preprint report numbers, keyed by the index at
             which each match was found in the line.
           * a dictionary containing the replacement strings (standardised
             versions) of preprint report numbers that were matched in
             the line.
           * a string, that is the new version of the working reference
             line, in which any matched preprint report numbers have been
             replaced by underscores.
           Returned tuple is therefore in the following order:
             (matched-reportnum-lengths, matched-reportnum-replacements,
              working-line)
    """
    repnum_matches_matchlen = {}  # lengths of report numbers matched,
                                  # keyed by start position in line
    repnum_matches_repl_str = {}  # standardised report numbers matched,
                                  # keyed by start position in line
    preprint_repnum_search_kb, preprint_repnum_standardised_categs = kb_reports
    # Search the longest categories first, so that a longer report-number
    # style is preferred over a shorter one matching the same text.
    # (BUGFIX: the previous comparator compared len(a[1]) / len(b[1]) -
    # the length of the *second character* of each key, always 1 - so it
    # never actually ordered the categories.)
    preprint_repnum_categs = sorted(preprint_repnum_standardised_categs.keys(),
                                    key=len, reverse=True)
    # Handle CERN/LHCC/98-013
    line = line.replace('/', ' ')
    # try to match preprint report numbers in the line:
    for categ in preprint_repnum_categs:
        # search for all instances of the current report
        # numbering style in the line:
        repnum_matches_iter = preprint_repnum_search_kb[categ].finditer(line)
        # for each matched report number of this style:
        for repnum_match in repnum_matches_iter:
            # Get the matched text for the numeration part of the
            # preprint report number:
            numeration_match = repnum_match.group('numn')
            # clean/standardise this numeration text:
            numeration_match = numeration_match.replace(" ", "-")
            numeration_match = re_multiple_hyphens.sub("-", numeration_match)
            numeration_match = numeration_match.replace("/-", "/")
            numeration_match = numeration_match.replace("-/", "/")
            numeration_match = numeration_match.replace("-/-", "/")
            # replace the found preprint report number in the
            # string with underscores
            # (this will replace chars in the lower-cased line):
            # (BUGFIX: removed a stray lone '+' statement here that made
            # the module unparseable.)
            line = line[0:repnum_match.start(1)] \
                + "_" * len(repnum_match.group(1)) \
                + line[repnum_match.end(1):]
            # record the information about the matched preprint report number:
            # total length in the line of the matched preprint report number:
            repnum_matches_matchlen[repnum_match.start(1)] = \
                len(repnum_match.group(1))
            # standardised replacement for the matched preprint report number:
            repnum_matches_repl_str[repnum_match.start(1)] = \
                preprint_repnum_standardised_categs[categ] \
                + numeration_match
    # return recorded information about matched report numbers, along with
    # the newly changed working line:
    return repnum_matches_matchlen, repnum_matches_repl_str, line
def identify_publishers(line, kb_publishers):
    """Attempt to identify all publisher names in a reference line.
       @param line: (string) - the working reference line.
       @param kb_publishers: (dict) - maps a standardised publisher
           abbreviation to an info dict holding (at least) a compiled
           'pattern' to search for in the line.
       @return: (dict) - start position in line -> standardised publisher
           abbreviation, for every publisher match found.
    """
    matches_repl = {}  # position in line -> standardised publisher
    # iteritems() was Python-2-only; items() is equivalent here and
    # keeps the function portable.
    for abbrev, info in kb_publishers.items():
        for match in info['pattern'].finditer(line):
            # record the position of the matched (non-standard) version
            # of the publisher, keyed to its standard abbreviation:
            matches_repl[match.start(0)] = abbrev
    return matches_repl
def identify_and_tag_URLs(line):
    """Given a reference line, identify URLs in the line, record the
       information about them, and replace them with a "<cds.URL />" tag.
       URLs are identified in 2 forms:
        + Raw: http://invenio-software.org/
        + HTML marked-up: <a href="http://invenio-software.org/">CERN
          Document Server Software Consortium</a>
       Each URL has 2 components: the URL string itself and the URL
       description. For an HTML marked-up URL, the text between the
       anchor tags is the description; for a raw URL, the URL string
       doubles as its own description.
       @param line: (string) the reference line in which to search for
          URLs.
       @return: (tuple) - containing 2 items:
        + the line after URLs have been recognised and replaced by tags;
        + a list of (url, url-description) 2-item tuples, one per
          recognised URL, in left-to-right order.
       @Exceptions raised:
        + an AssertionError if the number of URLs found does not match
          the number recorded (this should not happen.)
    """
    # Keep a copy of the original line for the error message below:
    line_pre_url_check = line
    # Details of matched URLs, each keyed by match start position:
    url_matchlen = {}  # total length of the URL match
    url_string = {}    # the URL itself
    url_descr = {}     # the URL description
    # First pass: HTML marked-up URLs (<a href="...">description</a>):
    for tagged in re_html_tagged_url.finditer(line):
        span_start = tagged.start()
        span_len = len(tagged.group(0))
        url_matchlen[span_start] = span_len
        url_string[span_start] = tagged.group('url')
        url_descr[span_start] = tagged.group('desc')
        # Blank the match with underscores so that the raw-URL pass
        # below cannot re-find it:
        line = line[:span_start] + u"_" * span_len + line[tagged.end():]
    # Second pass: raw (not HTML-marked-up) URLs:
    for raw in re_raw_url.finditer(line):
        span_start = raw.start()
        span_len = len(raw.group(0))
        matched_url = raw.group('url')
        if len(matched_url) > 0 and matched_url[-1] in (".", ","):
            # Strip the full-stop or comma from the end of the url:
            matched_url = matched_url[:-1]
        url_matchlen[span_start] = span_len
        url_string[span_start] = matched_url
        # A raw URL serves as its own description:
        url_descr[span_start] = matched_url
        # Blank the match so it won't be re-found:
        line = line[:span_start] + u"_" * span_len + line[raw.end():]
    # Re-insert the identified URLs as tags, working right-to-left so
    # that earlier offsets remain valid:
    for position in sorted(url_string.keys(), reverse=True):
        line = line[:position] + "<cds.URL />" \
            + line[position + url_matchlen[position]:]
    # Collect the (url, description) pairs in left-to-right order:
    identified_urls = [(url_string[position], url_descr[position])
                       for position in sorted(url_string.keys())]
    # Somehow the number of URLs found doesn't match the number of
    # URLs recorded in "identified_urls". Fail loudly.
    msg = """Error: The number of URLs found in the reference line """ \
          """does not match the number of URLs recorded in the """ \
          """list of identified URLs!\nLine pre-URL checking: %s\n""" \
          """Line post-URL checking: %s\n""" \
          % (line_pre_url_check, line)
    assert len(identified_urls) == len(url_string), msg
    # return the line containing the tagged URLs:
    return line, identified_urls
def identify_and_tag_DOI(line):
    """Take a single citation line and attempt to locate any DOI
       references.
       DOI references are recognised in both http (url) format and also
       the standard DOI notation (DOI: ...).
       @param line: (string) the reference line in which to search for
          DOI's.
       @return: (tuple) the line with each DOI replaced by a
          "<cds.DOI />" tag, and a list of the DOI strings found (in
          left-to-right order, empty if none).
    """
    doi_strings = []
    # Process the matches right-to-left so that replacing one match does
    # not shift the offsets of the matches still to be handled:
    for doi_match in reversed(list(re_doi.finditer(line))):
        # Replace the entire matched doi with a tag:
        line = line[:doi_match.start()] + "<cds.DOI />" \
            + line[doi_match.end():]
        # Group 6 carries the bare DOI string (without any url part):
        doi_strings.append(doi_match.group(6))
    # Matches were collected right-to-left; restore document order:
    doi_strings.reverse()
    return line, doi_strings
diff --git a/modules/docextract/lib/refextract_task.py b/modules/docextract/lib/refextract_task.py
index 7c3b5141c..d84620ea5 100644
--- a/modules/docextract/lib/refextract_task.py
+++ b/modules/docextract/lib/refextract_task.py
@@ -1,251 +1,285 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
Refextract task
Sends references to parse through bibsched
"""
import sys
+import os
from datetime import datetime, timedelta
+from tempfile import mkstemp
from invenio.bibtask import task_init, task_set_option, \
task_get_option, write_message
from invenio.config import CFG_VERSION, \
CFG_SITE_SECURE_URL, \
CFG_BIBCATALOG_SYSTEM, \
- CFG_REFEXTRACT_TICKET_QUEUE
+ CFG_REFEXTRACT_TICKET_QUEUE, \
+ CFG_INSPIRE_SITE
from invenio.dbquery import run_sql
from invenio.search_engine import perform_request_search
# Help message is the usage() print out of how to use Refextract
from invenio.refextract_cli import HELP_MESSAGE, DESCRIPTION
-from invenio.refextract_api import update_references, \
+from invenio.refextract_api import extract_references_from_record, \
FullTextNotAvailable, \
- RecordHasReferences
+ check_record_for_refextract
+from invenio.refextract_config import CFG_REFEXTRACT_FILENAME
+from invenio.config import CFG_TMPSHAREDDIR
+from invenio.bibtask import task_low_level_submission
from invenio.docextract_task import task_run_core_wrapper, \
split_ids
from invenio.docextract_utils import setup_loggers
+from invenio.docextract_record import print_records
from invenio.bibcatalog_system_rt import BibCatalogSystemRT
from invenio.bibedit_utils import get_bibrecord
from invenio.bibrecord import record_get_field_instances, \
field_get_subfield_values
def check_options():
""" Reimplement this method for having the possibility to check options
before submitting the task, in order for example to provide default
values. It must return False if there are errors in the options.
"""
if not task_get_option('new') \
and not task_get_option('modified') \
and not task_get_option('recids') \
and not task_get_option('collections') \
and not task_get_option('arxiv'):
print >>sys.stderr, 'Error: No records specified, you need' \
' to specify which files to run on'
return False
return True
def cb_parse_option(key, value, opts, args):
""" Must be defined for bibtask to create a task """
if args and len(args) > 0:
# There should be no standalone arguments for any refextract job
# This will catch args before the job is shipped to Bibsched
raise StandardError("Error: Unrecognised argument '%s'." % args[0])
if key in ('-a', '--new'):
task_set_option('new', True)
task_set_option('no-overwrite', True)
elif key in ('-m', '--modified'):
task_set_option('modified', True)
task_set_option('no-overwrite', True)
elif key in ('-i', '--inspire', ):
task_set_option('inspire', True)
elif key in ('--kb-reports', ):
task_set_option('kb-reports', value)
elif key in ('--kb-journals', ):
task_set_option('kb-journals', value)
elif key in ('--kb-journals-re', ):
task_set_option('kb-journals-re', value)
elif key in ('--kb-authors', ):
task_set_option('kb-authors', value)
elif key in ('--kb-books', ):
task_set_option('kb-books', value)
elif key in ('--kb-conferences', ):
task_set_option('kb-conferences', value)
elif key in ('--create-ticket', ):
task_set_option('create-ticket', True)
elif key in ('--no-overwrite', ):
task_set_option('no-overwrite', True)
elif key in ('--arxiv'):
task_set_option('arxiv', True)
elif key in ('-c', '--collections'):
collections = task_get_option('collections')
if not collections:
collections = set()
task_set_option('collections', collections)
for v in value.split(","):
collections.update(perform_request_search(c=v))
elif key in ('-r', '--recids'):
recids = task_get_option('recids')
if not recids:
recids = set()
task_set_option('recids', recids)
recids.update(split_ids(value))
return True
def create_ticket(recid, bibcatalog_system, queue=CFG_REFEXTRACT_TICKET_QUEUE):
- write_message('bibcatalog_system %s' % bibcatalog_system, verbose=1)
- write_message('queue %s' % queue, verbose=1)
+ write_message('ticket system: %s' % bibcatalog_system.__class__.__name__)
+ write_message('queue: %s' % queue)
if bibcatalog_system and queue:
+ results = bibcatalog_system.ticket_search(None,
+ recordid=recid,
+ queue=queue)
+ if results:
+ write_message("Ticket #%s found" % results[0])
+ else:
+ _create_ticket(recid, bibcatalog_system, queue)
+
- subject = "Refs for #%s" % recid
+def _create_ticket(recid, bibcatalog_system, queue):
+ subject = "Refs for #%s" % recid
+ if CFG_INSPIRE_SITE:
# Add report number in the subject
report_number = ""
record = get_bibrecord(recid)
- in_hep = False
+ in_core = False
for collection_tag in record_get_field_instances(record, "980"):
for collection in field_get_subfield_values(collection_tag, 'a'):
- if collection == 'HEP':
- in_hep = True
+ if collection == 'CORE':
+ in_core = True
# Only create tickets for HEP
- if not in_hep:
+ if not in_core:
write_message("not in hep", verbose=1)
return
# Do not create tickets for old records
creation_date = run_sql("""SELECT creation_date FROM bibrec
WHERE id = %s""", [recid])[0][0]
if creation_date < datetime.now() - timedelta(days=365*2):
return
for report_tag in record_get_field_instances(record, "037"):
for category in field_get_subfield_values(report_tag, 'c'):
if category.startswith('astro-ph'):
write_message("astro-ph", verbose=1)
# We do not curate astro-ph
return
for report_number in field_get_subfield_values(report_tag, 'a'):
subject += " " + report_number
break
- text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL, \
- recid)
- bibcatalog_system.ticket_submit(subject=subject,
- queue=queue,
- text=text,
- recordid=recid)
+ text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL,
+ recid)
+ bibcatalog_system.ticket_submit(subject=subject,
+ queue=queue,
+ text=text,
+ recordid=recid)
-def task_run_core(recid, bibcatalog_system=None, _arxiv=False):
+def task_run_core(recid, records, bibcatalog_system=None, _arxiv=False):
setup_loggers(None, use_bibtask=True)
if _arxiv:
overwrite = True
else:
overwrite = not task_get_option('no-overwrite')
try:
- update_references(recid,
- overwrite=overwrite)
+ record = extract_references_from_record(recid)
msg = "Extracted references for %s" % recid
+ safe_to_extract = True
if overwrite:
write_message("%s (overwrite)" % msg)
else:
write_message(msg)
-
- # Create a RT ticket if necessary
- if not _arxiv and task_get_option('new') \
- or task_get_option('create-ticket'):
- write_message("Checking if we should create a ticket", verbose=1)
- create_ticket(recid, bibcatalog_system)
+ if not check_record_for_refextract(recid):
+ write_message('Record not safe for re-extraction, skipping')
+ safe_to_extract = False
+
+ if safe_to_extract:
+ records.append(record)
+ # Create a RT ticket if necessary
+ if task_get_option('new') or task_get_option('create-ticket'):
+ create_ticket(recid, bibcatalog_system)
except FullTextNotAvailable:
write_message("No full text available for %s" % recid)
- except RecordHasReferences:
- write_message("Record %s has references, skipping" % recid)
+
+
+def submit_bibupload(bibcatalog_system=None, records=None):
+ if records:
+ references_xml = print_records(records)
+
+ # Save new record to file
+ temp_fd, temp_path = mkstemp(prefix=CFG_REFEXTRACT_FILENAME,
+ dir=CFG_TMPSHAREDDIR)
+ temp_file = os.fdopen(temp_fd, 'w')
+ temp_file.write(references_xml)
+ temp_file.close()
+
+ # Update record
+ task_low_level_submission('bibupload', 'refextract', '-c', temp_path)
def main():
"""Constructs the refextract bibtask."""
if CFG_BIBCATALOG_SYSTEM == 'RT':
bibcatalog_system = BibCatalogSystemRT()
else:
bibcatalog_system = None
- extra_vars = {'bibcatalog_system': bibcatalog_system}
+ extra_vars = {'bibcatalog_system': bibcatalog_system, 'records': []}
# Build and submit the task
task_init(authorization_action='runrefextract',
authorization_msg="Refextract Task Submission",
description=DESCRIPTION,
# get the global help_message variable imported from refextract.py
help_specific_usage=HELP_MESSAGE + """
Scheduled (daemon) options:
-a, --new Run on all newly inserted records.
-m, --modified Run on all newly modified records.
-r, --recids Record id for extraction.
-c, --collections Entire Collection for extraction.
--arxiv All arxiv modified records within last week
Special (daemon) options:
--create-ticket Create a RT ticket for record references
Examples:
(run a daemon job)
refextract -a
(run on a set of records)
refextract --recids 1,2 -r 3
(run on a collection)
refextract --collections "Reports"
(run as standalone)
refextract -o /home/chayward/refs.xml /home/chayward/thesis.pdf
""",
version="Invenio v%s" % CFG_VERSION,
specific_params=("hVv:x:r:c:nai",
["help",
"version",
"verbose=",
"inspire",
"kb-journals=",
"kb-journals-re=",
"kb-report-numbers=",
"kb-authors=",
"kb-books=",
"recids=",
"collections=",
"new",
"modified",
"no-overwrite",
"arxiv",
"create-ticket"]),
task_submit_elaborate_specific_parameter_fnc=cb_parse_option,
task_submit_check_options_fnc=check_options,
task_run_fnc=task_run_core_wrapper('refextract',
task_run_core,
- extra_vars=extra_vars))
+ extra_vars=extra_vars,
+ post_process=submit_bibupload))

Event Timeline