Page MenuHomec4science

No OneTemporary

File Metadata

Created
Mon, Nov 25, 09:53
diff --git a/modules/docextract/lib/refextract_config.py b/modules/docextract/lib/refextract_config.py
index b32bcbb89..e636aeceb 100644
--- a/modules/docextract/lib/refextract_config.py
+++ b/modules/docextract/lib/refextract_config.py
@@ -1,127 +1,127 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""RefExtract configuration"""
from invenio.config import CFG_VERSION, CFG_ETCDIR
# pylint: disable=C0301
-CFG_REFEXTRACT_VERSION_NUM = '1.5.35'
+CFG_REFEXTRACT_VERSION_NUM = '1.5.36'
# Version number:
CFG_REFEXTRACT_VERSION = "Invenio/%s refextract/%s" \
% (CFG_VERSION, CFG_REFEXTRACT_VERSION_NUM)
# Module config directory
CFG_CONF_DIR = '%s/docextract' % CFG_ETCDIR
CFG_REFEXTRACT_KBS = {
'journals' : "%s/journal-titles.kb" % CFG_CONF_DIR,
'journals-re' : "%s/journal-titles-re.kb" % CFG_CONF_DIR,
'report-numbers' : "%s/report-numbers.kb" % CFG_CONF_DIR,
'authors' : "%s/authors.kb" % CFG_CONF_DIR,
'collaborations' : "%s/collaborations.kb" % CFG_CONF_DIR,
'books' : "%s/books.kb" % CFG_CONF_DIR,
'conferences' : "%s/conferences.kb" % CFG_CONF_DIR,
'publishers' : "%s/publishers.kb" % CFG_CONF_DIR,
'special-journals': "%s/special-journals.kb" % CFG_CONF_DIR,
}
# Prefix for temp files
CFG_REFEXTRACT_FILENAME = "refextract"
## MARC Fields and subfields used by refextract:
# Reference fields:
CFG_REFEXTRACT_FIELDS = {
'misc': 'm',
'linemarker': 'o',
'doi': 'a',
'reportnumber': 'r',
'journal': 's',
'url': 'u',
'urldesc': 'z',
'author': 'h',
'title': 't',
'isbn': 'i',
'publisher': 'p',
'year': 'y',
'collaboration': 'c',
'recid': '0',
}
CFG_REFEXTRACT_TAG_ID_REFERENCE = "999" # ref field tag
CFG_REFEXTRACT_IND1_REFERENCE = "C" # ref field ind1
CFG_REFEXTRACT_IND2_REFERENCE = "5" # ref field ind2
## refextract statistics fields:
CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS = "999C6" # ref-stats tag
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS = "a" # ref-stats subfield
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_TIME = "t" # ref-stats time subfield
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_VERSION = "v" # ref-stats version subfield
## Internal tags are used by refextract to mark-up recognised citation
## information.
CFG_REFEXTRACT_MARKER_OPENING_REPORT_NUM = r"<cds.REPORTNUMBER>"
CFG_REFEXTRACT_MARKER_OPENING_TITLE = r"<cds.JOURNAL>"
CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID = r"<cds.JOURNALibid>"
CFG_REFEXTRACT_MARKER_OPENING_SERIES = r"<cds.SER>"
CFG_REFEXTRACT_MARKER_OPENING_VOLUME = r"<cds.VOL>"
CFG_REFEXTRACT_MARKER_OPENING_YEAR = r"<cds.YR>"
CFG_REFEXTRACT_MARKER_OPENING_PAGE = r"<cds.PG>"
CFG_REFEXTRACT_MARKER_OPENING_QUOTED = r"<cds.QUOTED>"
CFG_REFEXTRACT_MARKER_OPENING_ISBN = r"<cds.ISBN>"
CFG_REFEXTRACT_MARKER_OPENING_PUBLISHER = r"<cds.PUBLISHER>"
CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION = r"<cds.COLLABORATION>"
# These are the "closing" tags:
CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM = r"</cds.REPORTNUMBER>"
CFG_REFEXTRACT_MARKER_CLOSING_TITLE = r"</cds.JOURNAL>"
CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID = r"</cds.JOURNALibid>"
CFG_REFEXTRACT_MARKER_CLOSING_SERIES = r"</cds.SER>"
CFG_REFEXTRACT_MARKER_CLOSING_VOLUME = r"</cds.VOL>"
CFG_REFEXTRACT_MARKER_CLOSING_YEAR = r"</cds.YR>"
CFG_REFEXTRACT_MARKER_CLOSING_PAGE = r"</cds.PG>"
CFG_REFEXTRACT_MARKER_CLOSING_QUOTED = r"</cds.QUOTED>"
CFG_REFEXTRACT_MARKER_CLOSING_ISBN = r"</cds.ISBN>"
CFG_REFEXTRACT_MARKER_CLOSING_PUBLISHER = r"</cds.PUBLISHER>"
CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION = r"</cds.COLLABORATION>"
## Of the form '</cds.AUTHxxxx>' only
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND = r"</cds.AUTHstnd>"
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL = r"</cds.AUTHetal>"
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL = r"</cds.AUTHincl>"
## The minimum length of a reference's misc text to be deemed insignificant.
## when comparing misc text with semi-colon defined sub-references.
## Values higher than this value reflect meaningful misc text.
## Hence, upon finding a correct semi-colon, but having current misc text
## length less than this value (without other meaningful reference objects:
## report numbers, titles...) then no split will occur.
## (A higher value will increase splitting strictness. i.e. Fewer splits)
CGF_REFEXTRACT_SEMI_COLON_MISC_TEXT_SENSITIVITY = 60
## The length of misc text between two adjacent authors which is
## deemed as insignificant. As such, when misc text of a length less
## than this value is found, then the latter author group is dumped into misc.
## (A higher value will increase splitting strictness. i.e. Fewer splits)
CGF_REFEXTRACT_ADJACENT_AUTH_MISC_SEPARATION = 10
## Maximum number of lines for a citation before it is considered invalid
CFG_REFEXTRACT_MAX_LINES = 25
diff --git a/modules/docextract/lib/refextract_re.py b/modules/docextract/lib/refextract_re.py
index cbf93220b..7d4efc7b2 100644
--- a/modules/docextract/lib/refextract_re.py
+++ b/modules/docextract/lib/refextract_re.py
@@ -1,841 +1,853 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
import re
from datetime import datetime
# Sep
re_sep = ur"\s*[,\s:-]\s*"
# Sep or no sep
re_sep_opt = ur"\s*[,\s:-]?\s*"
# Pattern for PoS journal
# e.g. 2006
re_pos_year_num = ur'(?:19|20)\d{2}'
re_pos_year = ur'(?P<year>(' \
+ ur'\s' + re_pos_year_num + ur'\s' \
+ ur'|' \
+ ur'\(' + re_pos_year_num + '\)' \
+ ur'))'
# e.g. LAT2007
re_pos_volume = ur'(?P<volume_name>\w{1,10})' + re_sep_opt + ur'(?P<volume_num>(?:19|20)\d{2})'
# e.g. (LAT2007)
re_pos_volume_par = ur'\(' + re_pos_volume + ur'\)'
# e.g. 20
re_pos_page = ur'(?P<page>\d{1,4})'
re_pos_title = ur'POS'
re_pos_patterns = [
re_pos_title + re_sep_opt + re_pos_year + re_sep + re_pos_volume + re_sep + re_pos_page,
re_pos_title + re_sep + re_pos_volume + re_sep_opt + re_pos_year + re_sep_opt + re_pos_page,
re_pos_title + re_sep + re_pos_volume + re_sep + re_pos_page + re_sep_opt + re_pos_year,
re_pos_title + re_sep_opt + re_pos_volume_par + re_sep_opt + re_pos_page,
]
re_opts = re.VERBOSE | re.UNICODE | re.IGNORECASE
def compute_pos_patterns(patterns):
    """Compile each PoS pattern string with the module-level ``re_opts`` flags.

    @param patterns: (iterable) of regex pattern strings.
    @return: (list) of compiled regex objects, in the order given.
    """
    compiled = []
    for pattern in patterns:
        compiled.append(re.compile(pattern, re_opts))
    return compiled
re_pos = compute_pos_patterns(re_pos_patterns)
# Pattern for arxiv numbers
# arxiv 9910-1234v9 [physics.ins-det]
re_arxiv = re.compile(ur"""
ARXIV[\s:-]*(?P<year>\d{2})-?(?P<month>\d{2})
- [\s.-]*(?P<num>\d{4})(?:[\s-]*V(?P<version>\d))?
+ [\s.-]*(?P<num>\d{4})(?!\d)(?:[\s-]*V(?P<version>\d))?
+ \s*(?P<suffix>\[[A-Z.-]+\])? """, re.VERBOSE | re.UNICODE | re.IGNORECASE)
+
+re_arxiv_5digits = re.compile(ur"""
+ ARXIV[\s:-]*(?P<year>(1[3-9]|[2-8][0-9]))-?(?P<month>(0[1-9]|1[0-2]))
+ [\s.-]*(?P<num>\d{5})(?!\d)(?:[\s-]*V(?P<version>\d))?
\s*(?P<suffix>\[[A-Z.-]+\])? """, re.VERBOSE | re.UNICODE | re.IGNORECASE)
# Pattern for arxiv numbers catchup
# arxiv:9910-123 [physics.ins-det]
RE_ARXIV_CATCHUP = re.compile(ur"""
ARXIV[\s:-]*(?P<year>\d{2})-?(?P<month>\d{2})
[\s.-]*(?P<num>\d{3})
\s*\[(?P<suffix>[A-Z.-]+)\]""", re.VERBOSE | re.UNICODE | re.IGNORECASE)
# Patterns for ATLAS CONF report numbers
RE_ATLAS_CONF_PRE_2010 = re.compile(
ur'(?<!\w:)ATL(AS)?-CONF-(?P<code>(?:200\d|99)-\d{3})(?![\w\d])')
RE_ATLAS_CONF_POST_2010 = re.compile(
ur'(?<!\w:)ATL(AS)?-CONF-(?P<code>20[1-9]\d-\d{3})(?![\w\d])')
# Pattern for old arxiv numbers
old_arxiv_numbers = ur"[\|/:\s-]?(?P<num>(?:9[1-9]|0[0-7])(?:0[1-9]|1[0-2])\d{3})(?:v\d{1,3})?(?=[^\w\d]|$)"
old_arxiv = {
ur"acc-ph": None,
ur"astro-ph": None,
ur"astro-phy": "astro-ph",
ur"astro-ph\.[a-z]{2}": None,
ur"atom-ph": None,
ur"chao-dyn": None,
ur"chem-ph": None,
ur"cond-mat": None,
ur"cs": None,
ur"cs\.[a-z]{2}": None,
ur"gr-qc": None,
ur"hep-ex": None,
ur"hep-lat": None,
ur"hep-ph": None,
ur"hepph": "hep-ph",
ur"hep-th": None,
ur"hepth": "hep-th",
ur"math": None,
ur"math\.[a-z]{2}": None,
ur"math-ph": None,
ur"nlin": None,
ur"nlin\.[a-z]{2}": None,
ur"nucl-ex": None,
ur"nucl-th": None,
ur"physics": None,
ur"physics\.acc-ph": None,
ur"physics\.ao-ph": None,
ur"physics\.atm-clus": None,
ur"physics\.atom-ph": None,
ur"physics\.bio-ph": None,
ur"physics\.chem-ph": None,
ur"physics\.class-ph": None,
ur"physics\.comp-ph": None,
ur"physics\.data-an": None,
ur"physics\.ed-ph": None,
ur"physics\.flu-dyn": None,
ur"physics\.gen-ph": None,
ur"physics\.geo-ph": None,
ur"physics\.hist-ph": None,
ur"physics\.ins-det": None,
ur"physics\.med-ph": None,
ur"physics\.optics": None,
ur"physics\.plasm-ph": None,
ur"physics\.pop-ph": None,
ur"physics\.soc-ph": None,
ur"physics\.space-ph": None,
ur"plasm-ph": "physics.plasm-ph",
ur"q-bio\.[a-z]{2}": None,
ur"q-fin\.[a-z]{2}": None,
ur"q-alg": None,
ur"quant-ph": None,
ur"quant-phys": "quant-ph",
ur"solv-int": None,
ur"stat\.[a-z]{2}": None,
ur"stat-mech": None,
ur"dg-ga": None,
ur"hap-ph": "hep-ph",
ur"funct-an": None,
ur"quantph": "quant-ph",
ur"stro-ph": "astro-ph",
ur"hepex": "hep-ex",
ur"math-ag": "math.ag",
ur"math-dg": "math.dg",
ur"nuc-th": "nucl-th",
ur"math-ca": "math.ca",
ur"nlin-si": "nlin.si",
ur"quantum-ph": "quant-ph",
ur"ep-ph": "hep-ph",
ur"ep-th": "hep-ph",
ur"ep-ex": "hep-ex",
ur"hept-h": "hep-th",
ur"hepp-h": "hep-ph",
ur"physi-cs": "physics",
ur"asstro-ph": "astro-ph",
ur"hep-lt": "hep-lat",
ur"he-ph": "hep-ph",
ur"het-ph": "hep-ph",
ur"mat-ph": "math.th",
ur"math-th": "math.th",
ur"ucl-th": "nucl-th",
ur"nnucl-th": "nucl-th",
ur"nuclt-th": "nucl-th",
ur"atro-ph": "astro-ph",
ur"qnant-ph": "quant-ph",
ur"astr-ph": "astro-ph",
ur"math-qa": "math.qa",
ur"tro-ph": "astro-ph",
ur"hucl-th": "nucl-th",
ur"math-gt": "math.gt",
ur"math-nt": "math.nt",
ur"math-ct": "math.ct",
ur"math-oa": "math.oa",
ur"math-sg": "math.sg",
ur"math-ap": "math.ap",
ur"quan-ph": "quant-ph",
ur"nlin-cd": "nlin.cd",
ur"math-sp": "math.sp",
ur"atro-ph": "astro-ph",
ur"ast-ph": "astro-ph",
ur"asyro-ph": "astro-ph",
ur"aastro-ph": "astro-ph",
ur"astrop-ph": "astro-ph",
ur"arxivastrop-ph": "astro-ph",
ur"hept-th": "hep-th",
ur"quan-th": "quant-th",
ur"asro-ph": "astro-ph",
ur"castro-ph": "astro-ph",
ur"asaastro-ph": "astro-ph",
ur"hhep-ph": "hep-ph",
ur"hhep-ex": "hep-ex",
ur"alg-geom": None,
ur"nuclth": "nucl-th",
}
def compute_arxiv_re(report_pattern, report_number):
if report_number is None:
report_number = ur"\g<name>"
report_re = re.compile(ur"(?<!<cds\.REPORTNUMBER>)(?<!\w)" \
+ "(?P<name>" + report_pattern + ")" \
+ old_arxiv_numbers, re.U|re.I)
return report_re, report_number
RE_OLD_ARXIV = [compute_arxiv_re(*i) for i in old_arxiv.iteritems()]
-def compute_years():
+def compute_years(start_year=1991):
current_year = datetime.now().year
- return '|'.join(str(y)[2:] for y in xrange(1991, current_year + 1))
+ return '|'.join(str(y)[2:] for y in xrange(start_year, current_year + 1))
arxiv_years = compute_years()
+arxiv_years_5digits = compute_years(2013)
def compute_months():
    """Return the month numbers 01..12 joined into a regex alternation.

    @return: (string) '01|02|...|12'.
    """
    return '|'.join('%02d' % month for month in range(1, 13))
arxiv_months = compute_months()
re_new_arxiv = re.compile(ur""" # 9910.1234v9 [physics.ins-det]
- (?<!ARXIV:)
+ (?<!ARXIV:)(?<!\d)
+ (?P<year>%(arxiv_years)s)
+ (?P<month>(0[1-9]|1[0-2]))
+ \.(?P<num>\d{4})(?:[\s-]*V(?P<version>\d))?(?!\d)
+ \s*(?P<suffix>\[[A-Z.-]+\])? """ % {'arxiv_years': arxiv_years}, re.VERBOSE | re.UNICODE | re.IGNORECASE)
+
+re_new_arxiv_5digits = re.compile(ur""" # 9910.1234v9 [physics.ins-det]
+ (?<!ARXIV:)(?<!\d)
(?P<year>%(arxiv_years)s)
- (?P<month>%(arxiv_months)s)
- \.(?P<num>\d{4})(?:[\s-]*V(?P<version>\d))?
- \s*(?P<suffix>\[[A-Z.-]+\])? """ % {'arxiv_years': arxiv_years,
- 'arxiv_months': arxiv_months}, re.VERBOSE | re.UNICODE | re.IGNORECASE)
+ (?P<month>(0[1-9]|1[0-2]))
+ \.(?P<num>\d{5})(?:[\s-]*V(?P<version>\d))?(?!\d)
+ \s*(?P<suffix>\[[A-Z.-]+\])? """ % {'arxiv_years': arxiv_years_5digits}, re.VERBOSE | re.UNICODE | re.IGNORECASE)
# Pattern to recognize quoted text:
re_quoted = re.compile(ur'"(?P<title>[^"]+)"', re.UNICODE)
# Pattern to recognise an ISBN for a book:
re_isbn = re.compile(ur"""
(?:ISBN[-– ]*(?:|10|13)|International Standard Book Number)
[:\s]*
(?P<code>[-\-–0-9Xx]{10,25})""", re.VERBOSE | re.UNICODE)
# Pattern to recognise a correct knowledge base line:
re_kb_line = re.compile(ur'^\s*(?P<seek>[^\s].*)\s*---\s*(?P<repl>[^\s].*)\s*$',
re.UNICODE)
# precompile some often-used regexp for speed reasons:
re_regexp_character_class = re.compile(ur'\[[^\]]+\]', re.UNICODE)
re_multiple_hyphens = re.compile(ur'-{2,}', re.UNICODE)
# In certain papers, " bf " appears just before the volume of a
# cited item. It is believed that this is a mistyped TeX command for
# making the volume "bold" in the paper.
# The line may look something like this after numeration has been recognised:
# M. Bauer, B. Stech, M. Wirbel, Z. Phys. bf C : <cds.VOL>34</cds.VOL>
# <cds.YR>(1987)</cds.YR> <cds.PG>103</cds.PG>
# The " bf " stops the title from being correctly linked with its series
# and/or numeration and thus breaks the citation.
# The pattern below is used to identify this situation and remove the
# " bf" component:
re_identify_bf_before_vol = \
re.compile(ur' bf ((\w )?: \<cds\.VOL\>)', \
re.UNICODE)
# Patterns used for creating institutional preprint report-number
# recognition patterns (used by function "institute_num_pattern_to_regex"):
# Recognise any character that isn't a->z, A->Z, 0->9, /, [, ], ' ', '"':
re_report_num_chars_to_escape = \
re.compile(ur'([^\]A-Za-z0-9\/\[ "])', re.UNICODE)
# Replace "hello" with hello:
re_extract_quoted_text = (re.compile(ur'\"([^"]+)\"', re.UNICODE), ur'\g<1>',)
# Replace / [abcd ]/ with /( [abcd])?/ :
re_extract_char_class = (re.compile(ur' \[([^\]]+) \]', re.UNICODE), \
ur'( [\g<1>])?')
# URL recognition:
raw_url_pattern = ur"""
(https?|s?ftp)://(?:[\w\d_.-])+(?::\d{1,5})?
(?:/[\w\d_.?=&%~∼-]+)*/?
"""
# Stand-alone URL (e.g. http://invenio-software.org/ )
re_raw_url = \
re.compile("['\"]?(?P<url>" + raw_url_pattern + ")['\"]?",
re.UNICODE|re.I|re.VERBOSE)
# HTML marked-up URL (e.g. <a href="http://invenio-software.org/">
# CERN Document Server Software Consortium</a> )
re_html_tagged_url = \
re.compile(ur"""
# Opening a tag
<a\s+
# href attribute
href\s*=\s*[\'"]
# href value
(?P<url>""" + raw_url_pattern + ur""")
# href closing quote
['"]\s*>
# Tag content
(?P<desc>[^\<]+)
# Closing a tag
</a>""", re.UNICODE|re.I|re.VERBOSE)
# Numeration recognition pattern - used to identify numeration
# associated with a title when marking the title up into MARC XML:
vol_tag = ur'<cds\.VOL\>(?P<vol>[^<]+)<\/cds\.VOL>'
year_tag = ur'\<cds\.YR\>\((?P<yr>[^<]+)\)\<\/cds\.YR\>'
series_tag = ur'(?P<series>(?:[A-H]|I{1,3}V?|VI{0,3}))?'
page_tag = ur'\<cds\.PG\>(?P<pg>[^<]+)\<\/cds\.PG\>'
re_recognised_numeration_for_title_plus_series = re.compile(
ur'^\s*[\.,]?\s*(?:Ser\.\s*)?' + series_tag + ur'\s*:?\s*' + vol_tag +
u'\s*(?: ' + year_tag + u')?\s*(?: ' + page_tag + u')', re.UNICODE)
# Another numeration pattern. This one is designed to match marked-up
# numeration that is essentially an IBID, but without the word "IBID". E.g.:
# <cds.JOURNAL>J. Phys. A</cds.JOURNAL> : <cds.VOL>31</cds.VOL>
# <cds.YR>(1998)</cds.YR> <cds.PG>2391</cds.PG>; : <cds.VOL>32</cds.VOL>
# <cds.YR>(1999)</cds.YR> <cds.PG>6119</cds.PG>.
re_numeration_no_ibid_txt = \
re.compile(ur"""
^((\s*;\s*|\s+and\s+)(?P<series>(?:[A-H]|I{1,3}V?|VI{0,3}))?\s*:?\s ## Leading ; : or " and :", and a possible series letter
\<cds\.VOL\>(?P<vol>\d+|(?:\d+\-\d+))\<\/cds\.VOL>\s ## Volume
\<cds\.YR\>\((?P<yr>[12]\d{3})\)\<\/cds\.YR\>\s ## year
\<cds\.PG\>(?P<pg>[RL]?\d+[c]?)\<\/cds\.PG\>) ## page
""", re.UNICODE|re.VERBOSE)
re_title_followed_by_series_markup_tags = \
re.compile(ur'(\<cds.JOURNAL(?P<ibid>ibid)?\>([^\<]+)\<\/cds.JOURNAL(?:ibid)?\>\s*.?\s*\<cds\.SER\>([A-H]|(I{1,3}V?|VI{0,3}))\<\/cds\.SER\>)', re.UNICODE)
re_title_followed_by_implied_series = \
re.compile(ur'(\<cds.JOURNAL(?P<ibid>ibid)?\>([^\<]+)\<\/cds.JOURNAL(?:ibid)?\>\s*.?\s*([A-H]|(I{1,3}V?|VI{0,3}))\s+:)', re.UNICODE)
re_punctuation = re.compile(ur'[\.\,\;\'\(\)\-]', re.UNICODE)
# The following pattern is used to recognise "citation items" that have been
# identified in the line, when building a MARC XML representation of the line:
re_tagged_citation = re.compile(ur"""
\<cds\. ## open tag: <cds.
((?:JOURNAL(?P<ibid>ibid)?) ## a JOURNAL tag
|VOL ## or a VOL tag
|YR ## or a YR tag
|PG ## or a PG tag
|REPORTNUMBER ## or a REPORTNUMBER tag
|SER ## or a SER tag
|URL ## or a URL tag
|DOI ## or a DOI tag
|QUOTED ## or a QUOTED tag
|ISBN ## or a ISBN tag
|PUBLISHER ## or a PUBLISHER tag
|COLLABORATION ## or a COLLABORATION tag
|AUTH(stnd|etal|incl)) ## or an AUTH tag
(\s\/)? ## optional /
\> ## closing of tag (>)
""", re.UNICODE|re.VERBOSE)
# is there pre-recognised numeration-tagging within a
# few characters of the start of this part of the line?
re_tagged_numeration_near_line_start = \
re.compile(ur'^.{0,4}?<CDS (VOL|SER)>', re.UNICODE)
re_ibid = re.compile(ur'(-|\b)?IBID(EM)?\.?', re.UNICODE)
re_series_from_numeration = re.compile(ur'^([A-Z])\s*[,\s:-]?\s*\d+', re.UNICODE)
re_series_from_numeration_after_volume = re.compile(ur'^\d+\s*[,\s:-]?\s*([A-Z])', re.UNICODE)
# Obtain the series character from the standardised title text
# Only used when no series letter is obtained from numeration matching
re_series_from_title = re.compile(ur"""
([^\s].*)
(?:[\s\.]+(?:(?P<open_bracket>\()\s*[Ss][Ee][Rr]\.)?
([A-H]|(I{1,3}V?|VI{0,3}))
)?
(?(open_bracket)\s*\))$ ## Only match the ending bracket if the opening bracket was found""", \
re.UNICODE|re.VERBOSE)
re_wash_volume_tag = (
re.compile(ur'<cds\.VOL>(\w) (\d+)</cds\.VOL>'),
ur'<cds.VOL>\g<1>\g<2></cds.VOL>',
)
# Roman Numbers
re_roman_numbers = ur"[XxVvIi]+"
# Possible beginnings of numeration
re_start = ur"\s*[,\s:-]?\s*"
# Title tag
re_title_tag = ur"(?P<title_tag><cds\.JOURNAL>[^<]*<\/cds\.JOURNAL>)"
# Number (within a volume)
re_volume_sub_number = ur'[Nn][oO°]\.?\s*\d{1,6}'
re_volume_sub_number_opt = u'(?:' + re_sep + u'(?P<vol_sub>' + \
re_volume_sub_number + u'))?'
# Volume
re_volume_prefix = ur"(?:[Vv]o?l?\.?|[Nn][oO°]\.?)" # Optional Vol./No.
re_volume_suffix = ur"(?:\s*\(\d{1,2}(?:-\d)?\))?"
re_volume_num = ur"\d+|" + "(?:(?<!\w)" + re_roman_numbers + "(?!\w))"
re_volume_id = ur"(?P<vol>(?:(?:[A-Za-z]\s*[,\s:-]?\s*)?(?P<vol_num>%(volume_num)s))|(?:(?P<vol_num_alt>%(volume_num)s)(?:[A-Za-z]))|(?:(?:[A-Za-z]\s?)?(?P<vol_num_alt2>\d+)\s*\-\s*(?:[A-Za-z]\s?)?\d+))" % {'volume_num': re_volume_num}
re_volume_check = ur"(?<![\/\d])"
re_volume = ur"\b" + u"(?:" + re_volume_prefix + u")?\s*" + re_volume_check + \
re_volume_id + re_volume_suffix
# Month
re_short_month = ur"""(?:(?:
[Jj]an|[Ff]eb|[Mm]ar|[Aa]pr|[Mm]ay|[Jj]un|
[Jj]ul|[Aa]ug|[Ss]ep|[Oo]ct|[Nn]ov|[Dd]ec
)\.?)"""
re_month = ur"""(?:(?:
[Jj]anuary|[Ff]ebruary|[Mm]arch|[Aa]pril|[Mm]ay|[Jj]une|
[Jj]uly|[Aa]ugust|[Ss]eptember|[Oo]ctober|[Nn]ovember|[Dd]ecember
)\.?)"""
# Year
re_year_num = ur"(?:19|20)\d{2}"
re_year_text = u"(?P<year>[A-Za-z]?" + re_year_num + u")(?:[A-Za-z]?)"
re_year = ur"""
\(?
(?:%(short_month)s[,\s]\s*)? # Jul, 1980
(?:%(month)s[,\s]\s*)? # July, 1980
(?<!\d)
%(year)s
(?!\d)
\)?
""" % {
'year': re_year_text,
'short_month': re_short_month,
'month': re_month,
}
# Page
re_page_prefix = ur"[pP]?[p]?\.?\s?" # Starting page num: optional Pp.
re_page_num = ur"[RL]?\w?\d+[cC]?" # pagenum with optional R/L
re_page_sep = ur"\s*-\s*" # optional separator between pagenums
re_page = re_page_prefix + \
u"(?P<page>" + re_page_num + u")(?:" + re_page_sep + \
u"(?P<page_end>" + re_page_num + u"))?"
# Series
re_series = ur"(?P<series>[A-H])"
# Used for allowing 3(1991) without space
re_look_ahead_parentesis = ur"(?=\()"
re_sep_or_parentesis = u'(?:' + re_sep + u'|' + re_look_ahead_parentesis + ')'
re_look_behind_parentesis = ur"(?<=\))"
re_sep_or_after_parentesis = u'(?:' + \
re_sep + u'|' + re_look_behind_parentesis + ')'
# After having processed a line for titles, it may be possible to find more
# numeration with the aid of the recognised titles. The following 2 patterns
# are used for this:
re_correct_numeration_2nd_try_ptn1 = re.compile(
re_year + re_sep + # Year
re_title_tag + # Recognised, tagged title
u'(?P<aftertitle>' +
re_sep +
re_volume + re_sep + # The volume
re_page + # The page
u')', re.UNICODE|re.VERBOSE)
re_correct_numeration_2nd_try_ptn2 = re.compile(
re_year + re_sep +
re_title_tag +
u'(?P<aftertitle>' +
re_sep +
re_volume + re_sep +
re_series + re_sep +
re_page +
u')', re.UNICODE|re.VERBOSE)
re_correct_numeration_2nd_try_ptn3 = re.compile(
re_title_tag +
u'(?P<aftertitle>' +
re_sep + # Recognised, tagged title
re_volume + re_sep + # The volume
re_page + # The page
u')', re.UNICODE|re.VERBOSE)
re_correct_numeration_2nd_try_ptn4 = re.compile(
re_title_tag +
u'(?P<aftertitle>' +
re_sep + # Recognised, tagged title
re_year + ur"\s*[.,\s:]\s*" + # Year
re_volume + re_sep + # The volume
re_page + # The page
u')', re.UNICODE|re.VERBOSE)
## precompile some regexps used to search for and standardize
## numeration patterns in a line for the first time:
## Delete the colon and expressions such as Serie, vol, V. inside the pattern
## <serie : volume> E.g. Replace the string """Series A, Vol 4""" with """A 4"""
re_strip_series_and_volume_labels = (re.compile(
ur'(Serie\s|\bS\.?\s)?([A-H])\s?[:,]\s?(\b[Vv]o?l?\.?|\b[Nn]o\.?)?\s?(\d+)', re.UNICODE),
ur'\g<2> \g<4>')
## This pattern is not compiled, but rather included in
## the other numeration patterns:
re_nucphysb_subtitle = \
ur'(?:[\(\[]\s*(?:[Ff][Ss]|[Pp][Mm])\s*\d{0,4}\s*[\)\]])'
re_nucphysb_subtitle_opt = \
u'(?:' + re_sep + re_nucphysb_subtitle + u')?'
## the main numeration patterns:
## Pattern 1: <vol, page, year>
## <v, p, y>
re_numeration_vol_page_yr = re.compile(
re_start +
re_volume + re_volume_sub_number_opt + re_sep +
re_page + re_sep_or_parentesis +
re_year, re.UNICODE|re.VERBOSE)
## <v, [FS], p, y>
re_numeration_vol_nucphys_page_yr = re.compile(
re_start +
re_volume + re_volume_sub_number_opt + re_sep +
re_nucphysb_subtitle + re_sep +
re_page + re_sep_or_parentesis +
re_year, re.UNICODE|re.VERBOSE)
## <[FS], v, p, y>
re_numeration_nucphys_vol_page_yr = re.compile(
re_start +
re_nucphysb_subtitle + re_sep +
re_volume + re_sep +
re_page + re_sep_or_parentesis +
re_year, re.UNICODE|re.VERBOSE)
## Pattern 2: <vol, year, page>
## <v, y, p>
re_numeration_vol_yr_page = re.compile(
re_start +
re_volume + re_sep_or_parentesis +
re_year + re_sep_or_after_parentesis +
re_page, re.UNICODE|re.VERBOSE)
## <v, sv, [FS]?, y, p>
re_numeration_vol_subvol_nucphys_yr_page = re.compile(
re_start +
re_volume + re_volume_sub_number_opt +
re_nucphysb_subtitle_opt + re_sep_or_parentesis +
re_year + re_sep_or_after_parentesis +
re_page, re.UNICODE|re.VERBOSE)
## <v, [FS]?, y, sv, p>
re_numeration_vol_nucphys_yr_subvol_page = re.compile(
re_start +
re_volume + re_nucphysb_subtitle_opt +
re_sep_or_parentesis +
re_year + re_volume_sub_number_opt + re_sep +
re_page, re.UNICODE|re.VERBOSE)
## <[FS]?, v, y, p>
re_numeration_nucphys_vol_yr_page = re.compile(
re_start +
re_nucphysb_subtitle + re_sep +
re_volume + re_sep_or_parentesis + # The volume (optional "vol"/"no")
re_year + re_sep_or_after_parentesis + # Year
re_page, re.UNICODE|re.VERBOSE)
## Pattern 3: <vol, serie, year, page>
## <v, s, [FS]?, y, p>
# re_numeration_vol_series_nucphys_yr_page = (re.compile(
# re_volume + re_sep +
# re_series + re_sep +
# _sre_non_compiled_pattern_nucphysb_subtitle + re_sep_or_parentesis +
# re_year + re_sep +
# re_page, re.UNICODE|re.VERBOSE), ur' \g<series> : ' \
# ur'<cds.VOL>\g<vol></cds.VOL> ' \
# ur'<cds.YR>(\g<year>)</cds.YR> ' \
# ur'<cds.PG>\g<page></cds.PG> ')
## <v, [FS]?, s, y, p>
re_numeration_vol_nucphys_series_yr_page = re.compile(
re_start +
re_volume + re_nucphysb_subtitle_opt + re_sep +
re_series + re_sep_or_parentesis +
re_year + re_sep_or_after_parentesis +
re_page, re.UNICODE|re.VERBOSE)
## Pattern 4: <vol, serie, page, year>
## <v, s, [FS]?, p, y>
re_numeration_vol_series_nucphys_page_yr = re.compile(
re_start +
re_volume + re_sep +
re_series + re_nucphysb_subtitle_opt + re_sep +
re_page + re_sep +
re_year, re.UNICODE|re.VERBOSE)
## <v, [FS]?, s, p, y>
re_numeration_vol_nucphys_series_page_yr = re.compile(
re_start +
re_volume + re_nucphysb_subtitle_opt + re_sep +
re_series + re_sep +
re_page + re_sep +
re_year, re.UNICODE|re.VERBOSE)
## Pattern 5: <year, vol, page>
re_numeration_yr_vol_page = re.compile(
re_start +
re_year + re_sep_or_after_parentesis +
re_volume + re_sep +
re_page, re.UNICODE|re.VERBOSE)
## Pattern used to locate references of a doi inside a citation
## This pattern matches both url (http) and 'doi:' or 'DOI' formats
re_doi = (re.compile(ur"""
((\(?[Dd][Oo][Ii](\s)*\)?:?(\s)*) # 'doi:' or 'doi' or '(doi)' (upper or lower case)
|(https?://dx\.doi\.org\/))? # or 'http://dx.doi.org/' (neither has to be present)
(10\. # 10. (mandatory for DOI's)
\d{4} # [0-9] x4
/ # /
[\w\-_:;\(\)/\.<>]+ # any character
[\w\-_:;\(\)/<>]) # any character excluding a full stop
""", re.VERBOSE))
def _create_regex_pattern_add_optional_spaces_to_word_characters(word):
"""Add the regex special characters (\s*) to allow optional spaces between
the characters in a word.
@param word: (string) the word to be inserted into a regex pattern.
@return: string: the regex pattern for that word with optional spaces
between all of its characters.
"""
new_word = u""
for ch in word:
if ch.isspace():
new_word += ch
else:
new_word += ch + ur'\s*'
return new_word
def get_reference_section_title_patterns():
"""Return a list of compiled regex patterns used to search for the title of
a reference section in a full-text document.
@return: (list) of compiled regex patterns.
"""
patterns = []
titles = [u'references',
u'references.',
u'r\u00C9f\u00E9rences',
u'r\u00C9f\u00C9rences',
u'reference',
u'refs',
u'r\u00E9f\u00E9rence',
u'r\u00C9f\u00C9rence',
u'r\xb4ef\xb4erences',
u'r\u00E9fs',
u'r\u00C9fs',
u'bibliography',
u'bibliographie',
u'citations',
u'literaturverzeichnis']
sect_marker = u'^\s*([\[\-\{\(])?\s*' \
u'((\w|\d){1,5}([\.\-\,](\w|\d){1,5})?\s*' \
u'[\.\-\}\)\]]\s*)?' \
u'(?P<title>'
sect_marker1 = u'^(\d){1,3}\s*(?P<title>'
line_end = ur'(\s*s\s*e\s*c\s*t\s*i\s*o\s*n\s*)?)([\)\}\]])?' \
ur'($|\s*[\[\{\(\<]\s*[1a-z]\s*[\}\)\>\]]|\:$)'
for t in titles:
t_ptn = re.compile(sect_marker + \
_create_regex_pattern_add_optional_spaces_to_word_characters(t) + \
line_end, re.I|re.UNICODE)
patterns.append(t_ptn)
## allow e.g. 'N References' to be found where N is an integer
t_ptn = re.compile(sect_marker1 + \
_create_regex_pattern_add_optional_spaces_to_word_characters(t) + \
line_end, re.I|re.UNICODE)
patterns.append(t_ptn)
return patterns
def get_reference_line_numeration_marker_patterns(prefix=u''):
"""Return a list of compiled regex patterns used to search for the marker
of a reference line in a full-text document.
@param prefix: (string) the possible prefix to a reference line
@return: (list) of compiled regex patterns.
"""
title = u""
if type(prefix) in (str, unicode):
title = prefix
g_name = u'(?P<mark>'
g_close = u')'
space = ur'\s*'
patterns = [
# [1]
space + title + g_name + ur'\[\s*(?P<marknum>\d+)\s*\]' + g_close,
# [<letters and numbers]
space + title + g_name + ur'\[\s*[a-zA-Z:-]+\+?\s?(\d{1,4}[A-Za-z:-]?)?\s*\]' + g_close,
# {1}
space + title + g_name + ur'\{\s*(?P<marknum>\d+)\s*\}' + g_close,
# (1)
space + title + g_name + ur'\<\s*(?P<marknum>\d+)\s*\>' + g_close,
space + title + g_name + ur'\(\s*(?P<marknum>\d+)\s*\)' + g_close,
space + title + g_name + ur'(?P<marknum>\d+)\s*\.(?!\d)' + g_close,
space + title + g_name + ur'(?P<marknum>\d+)\s+' + g_close,
space + title + g_name + ur'(?P<marknum>\d+)\s*\]' + g_close,
# 1]
space + title + g_name + ur'(?P<marknum>\d+)\s*\}' + g_close,
# 1}
space + title + g_name + ur'(?P<marknum>\d+)\s*\)' + g_close,
# 1)
space + title + g_name + ur'(?P<marknum>\d+)\s*\>' + g_close,
# [1.1]
space + title + g_name + ur'\[\s*\d+\.\d+\s*\]' + g_close,
# [ ]
space + title + g_name + ur'\[\s*\]' + g_close,
# *
space + title + g_name + ur'\*' + g_close,
]
return [re.compile(p, re.I|re.UNICODE) for p in patterns]
def get_reference_line_marker_pattern(pattern):
    """Compile one reference-line marker pattern.
    The given pattern is wrapped in a named group called "mark" and compiled
    case-insensitively with unicode matching.  Typical markers are [1], {1},
    "1." or just a number.
    @param pattern: (string) regex describing one style of line marker.
    @return: a single compiled regex pattern (not a list).
    """
    wrapped = u''.join((u'(?P<mark>', pattern, u')'))
    return re.compile(wrapped, re.I|re.UNICODE)
re_reference_line_bracket_markers = get_reference_line_marker_pattern(
ur'(?P<left>\[)\s*(?P<marknum>\d+)\s*(?P<right>\])'
)
re_reference_line_curly_bracket_markers = get_reference_line_marker_pattern(
ur'(?P<left>\{)\s*(?P<marknum>\d+)\s*(?P<right>\})'
)
re_reference_line_dot_markers = get_reference_line_marker_pattern(
ur'(?P<left>)\s*(?P<marknum>\d+)\s*(?P<right>\.)'
)
re_reference_line_number_markers = get_reference_line_marker_pattern(
ur'(?P<left>)\s*(?P<marknum>\d+)\s*(?P<right>)'
)
def get_post_reference_section_title_patterns():
"""Return a list of compiled regex patterns used to search for the title
of the section after the reference section in a full-text document.
@return: (list) of compiled regex patterns.
"""
compiled_patterns = []
thead = ur'^\s*([\{\(\<\[]?\s*(\w|\d)\s*[\)\}\>\.\-\]]?\s*)?'
ttail = ur'(\s*\:\s*)?'
numatn = ur'(\d+|\w\b|i{1,3}v?|vi{0,3})[\.\,]{0,2}\b'
roman_numbers = ur'[LVIX]'
patterns = [
# Section titles
thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'appendix') + ttail,
thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'appendices') + ttail,
thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'acknowledgement') + ur's?' + ttail,
thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'acknowledgment') + ur's?' + ttail,
thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'table') + ur'\w?s?\d?' + ttail,
thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'figure') + ur's?' + ttail,
thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'list of figure') + ur's?' + ttail,
thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'annex') + ur's?' + ttail,
thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'discussion') + ur's?' + ttail,
thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'remercie') + ur's?' + ttail,
thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'index') + ur's?' + ttail,
thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'summary') + ur's?' + ttail,
# Figure nums
ur'^\s*' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'figure') + numatn,
ur'^\s*' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'fig') + ur'\.\s*' + numatn,
ur'^\s*' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'fig') + ur'\.?\s*\d\w?\b',
# Tables
ur'^\s*' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'table') + numatn,
ur'^\s*' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'tab') + ur'\.\s*' + numatn,
ur'^\s*' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'tab') + ur'\.?\s*\d\w?\b',
# Other titles formats
ur'^\s*' + roman_numbers + ur'\.?\s*[Cc]onclusion[\w\s]*$',
ur'^\s*Appendix\s[A-Z]\s*\:\s*[a-zA-Z]+\s*',
]
for p in patterns:
compiled_patterns.append(re.compile(p, re.I|re.UNICODE))
return compiled_patterns
def get_post_reference_section_keyword_patterns():
"""Return a list of compiled regex patterns used to search for various
keywords that can often be found after, and therefore suggest the end of,
a reference section in a full-text document.
@return: (list) of compiled regex patterns.
"""
compiled_patterns = []
patterns = [u'(' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'prepared') + \
ur'|' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'created') + \
ur').*(AAS\s*)?\sLATEX',
ur'AAS\s+?LATEX\s+?' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'macros') + u'v',
ur'^\s*' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'This paper has been produced using'),
ur'^\s*' + \
_create_regex_pattern_add_optional_spaces_to_word_characters(u'This article was processed by the author using Springer-Verlag') + \
u' LATEX']
for p in patterns:
compiled_patterns.append(re.compile(p, re.I|re.UNICODE))
return compiled_patterns
def regex_match_list(line, patterns):
    """Try "re.match" on the line with each COMPILED pattern in turn and
       return the first successful match object.

       Searching stops at the first pattern that matches. When no pattern
       matches (or the list is empty), None is returned.
       @param line: (unicode string) to be searched in.
       @param patterns: (list) of compiled regex patterns to search "line"
        with.
       @return: (None or an re.match object), depending upon whether one of
        the patterns matched within line or not.
    """
    for compiled in patterns:
        found = compiled.match(line)
        if found is not None:
            # First hit wins; later patterns are not tried.
            return found
    return None
# The different forms of arXiv notation.
# Compiled with re.VERBOSE, so the whitespace and newlines inside the pattern
# text are ignored. Two alternatives, each in its own capturing group:
#   group 1: the bare word "arxiv"
#   group 2: "eprint" / "e-print" / "e print", an optional colon, optional
#            whitespace, then "arxiv"
# NOTE(review): no re.I flag is passed here, so the pattern only matches
# lower-case text as written -- confirm callers lower-case their input.
re_arxiv_notation = re.compile(ur"""
(arxiv)|(e[\-\s]?print:?\s*arxiv)
""", re.VERBOSE)
# et. al. before J. /// means J is a journal
# NOTE(review): the remark above does not appear to describe re_num below;
# it may be a leftover from a pattern that was removed -- confirm.
# re_num: one or more consecutive digits, captured as group 1.
re_num = re.compile(ur'(\d+)')
diff --git a/modules/docextract/lib/refextract_regression_tests.py b/modules/docextract/lib/refextract_regression_tests.py
index a5f2b6b79..98d7c04f0 100644
--- a/modules/docextract/lib/refextract_regression_tests.py
+++ b/modules/docextract/lib/refextract_regression_tests.py
@@ -1,2853 +1,2853 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2010, 2011, 2013 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
The Refextract regression tests suite
The tests will not modify the database.
They are intended to make sure there is no regression in references parsing.
"""
from invenio.testutils import InvenioTestCase
import re
from invenio.testutils import make_test_suite, run_test_suite, InvenioXmlTestCase
from invenio.refextract_engine import parse_references
from invenio.docextract_utils import setup_loggers
from invenio.refextract_text import wash_and_repair_reference_line
from invenio import refextract_kbs
from invenio import refextract_record
def compare_references(test, record, expected_references, ignore_misc=True):
    """Assert that the extracted references match the expected MARCXML.

    @param test: test case providing assertXmlEqual
    @param record: extracted record; its '999' fields are replaced in place
        by the result of record.find_fields('999C5')
    @param expected_references: (unicode string) expected MARCXML; it is
        encoded to utf-8 before the comparison
    @param ignore_misc: when true, subfields with code 'm' are removed from
        every kept field before comparing
    """
    # Keep only the reference datafields, dropping the statistical datafield
    # from the final extracted references
    record['999'] = record.find_fields('999C5')

    if ignore_misc:
        # The content of the misc subfield is not part of the comparison
        for datafield in record['999']:
            kept = []
            for sub in datafield.subfields:
                if sub.code != 'm':
                    kept.append(sub)
            datafield.subfields = kept

    actual_xml = record.to_xml()
    expected_xml = expected_references.encode('utf-8')
    test.assertXmlEqual(actual_xml, expected_xml)
def _reference_test(test, ref_line, parsed_reference, ignore_misc=True):
    """Parse a single reference line and compare it with the expected XML.

    The line is first passed through wash_and_repair_reference_line, then
    parsed with parse_references using the knowledge bases stored on the
    test case (kb_journals, kb_journals_re, kb_report_numbers, kb_books).

    @param test: test case holding the kb_* attributes
    @param ref_line: (unicode string) raw reference line
    @param parsed_reference: (unicode string) expected MARCXML
    @param ignore_misc: forwarded to compare_references
    """
    washed_line = wash_and_repair_reference_line(ref_line)
    kbs_from_test = {
        'journals'       : test.kb_journals,
        'journals-re'    : test.kb_journals_re,
        'report-numbers' : test.kb_report_numbers,
        'books'          : test.kb_books,
    }
    extracted = parse_references([washed_line], kbs_files=kbs_from_test)
    compare_references(test, extracted, parsed_reference,
                       ignore_misc=ignore_misc)
class RefextractInvenioTest(InvenioXmlTestCase):
    """Reference parsing checks run with CFG_INSPIRE_SITE switched off and
    no test-specific knowledge bases (every kb_* attribute is None)."""

    def setUp(self):
        # Remember the module-level settings so tearDown can restore them.
        self.old_override = refextract_kbs.CFG_REFEXTRACT_KBS_OVERRIDE
        refextract_kbs.CFG_REFEXTRACT_KBS_OVERRIDE = {}
        self.old_inspire = refextract_record.CFG_INSPIRE_SITE
        refextract_record.CFG_INSPIRE_SITE = False
        setup_loggers(verbosity=0)
        self.maxDiff = 2000
        # None for each knowledge base.
        for kb_attr in ('kb_journals', 'kb_journals_re', 'kb_report_numbers',
                        'kb_authors', 'kb_books', 'kb_conferences'):
            setattr(self, kb_attr, None)

    def tearDown(self):
        # Put back the settings saved in setUp.
        refextract_kbs.CFG_REFEXTRACT_KBS_OVERRIDE = self.old_override
        refextract_record.CFG_INSPIRE_SITE = self.old_inspire

    def test_month_with_year(self):
        # The year is given together with a month: "(Nov, 1967)".
        citation = u"""[2] S. Weinberg, A Model of Leptons, Phys. Rev. Lett. 19 (Nov, 1967) 1264–1266."""
        expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">2</subfield>
<subfield code="h">S. Weinberg, A Model of Leptons</subfield>
<subfield code="s">Phys. Rev. Lett. 19 (1967) 1264</subfield>
<subfield code="y">1967</subfield>
</datafield>
</record>"""
        _reference_test(self, citation, expected_marcxml)

    def test_numeration_not_finding_year(self):
        # "vol. ..., pp. ..., <year>" numeration with the year at the end.
        citation = u"""[137] M. Papakyriacou, H. Mayer, C. Pypen, H. P. Jr., and S. Stanzl-Tschegg, “Influence of loading frequency on high cycle fatigue properties of b.c.c. and h.c.p. metals,” Materials Science and Engineering, vol. A308, pp. 143–152, 2001."""
        expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">137</subfield>
<subfield code="h">M. Papakyriacou, H. Mayer, C. Pypen, H. P. Jr., and S. Stanzl-Tschegg</subfield>
<subfield code="t">Influence of loading frequency on high cycle fatigue properties of b.c.c. and h.c.p. metals</subfield>
<subfield code="s">Mat.Sci.Eng. A308 (2001) 143</subfield>
<subfield code="y">2001</subfield>
</datafield>
</record>"""
        _reference_test(self, citation, expected_marcxml)

    def test_numeration_not_finding_year2(self):
        """Bug fix test for numeration not finding year in this citation"""
        citation = u"""[138] Y.-B. Park, R. Mnig, and C. A. Volkert, “Frequency effect on thermal fatigue damage in Cu interconnects,” Thin Solid Films, vol. 515, pp. 3253– 3258, 2007."""
        expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">138</subfield>
<subfield code="h">Y.-B. Park, R. Mnig, and C. A. Volkert</subfield>
<subfield code="t">Frequency effect on thermal fatigue damage in Cu interconnects</subfield>
<subfield code="s">Thin Solid Films 515 (2007) 3253</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>"""
        _reference_test(self, citation, expected_marcxml)

    def test_extra_a_in_report_number(self):
        # Several collaboration / report-number pairs in a single reference.
        citation = u'[14] CMS Collaboration, CMS-PAS-HIG-12-002. CMS Collaboration, CMS-PAS-HIG-12-008. CMS Collaboration, CMS-PAS-HIG-12-022. ATLAS Collaboration, arXiv:1205.0701. ATLAS Collaboration, ATLAS-CONF-2012-078.'
        expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="c">CMS Collaboration</subfield>
<subfield code="r">CMS-PAS-HIG-12-002</subfield>
<subfield code="c">CMS Collaboration</subfield>
<subfield code="r">CMS-PAS-HIG-12-008</subfield>
<subfield code="c">CMS Collaboration</subfield>
<subfield code="r">CMS-PAS-HIG-12-022</subfield>
<subfield code="c">ATLAS Collaboration</subfield>
<subfield code="r">arXiv:1205.0701</subfield>
<subfield code="c">ATLAS Collaboration</subfield>
<subfield code="r">ATLAS-CONF-2012-078</subfield>
</datafield>
</record>"""
        _reference_test(self, citation, expected_marcxml)
class RefextractTest(InvenioXmlTestCase):
"""Testing output of refextract"""
def setUp(self):
    """Switch CFG_INSPIRE_SITE on and install the small in-memory
    knowledge bases used by the tests of this class."""
    # Saved so that tearDown can restore the original value.
    self.old_inspire = refextract_record.CFG_INSPIRE_SITE
    refextract_record.CFG_INSPIRE_SITE = True
    self.inspire = True

    # Books: one tuple of three strings per book.
    books = [
        ('Griffiths, David', 'Introduction to elementary particles', '2008')
    ]
    self.kb_books = books

    # Journals: (name as searched for, standardised abbreviation) pairs.
    journals = [
        ("PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS", "Phys.Rev.ST Accel.Beams"),
        ("PHYS REV D", "Phys.Rev.;D"),
        ("PHYS REV", "Phys.Rev."),
        ("PHYS REV LETT", "Phys.Rev.Lett."),
        ("PHYS LETT", "Phys.Lett."),
        ("J PHYS", "J.Phys."),
        ("JOURNAL OF PHYSICS", "J.Phys."),
        ("J PHYS G", "J.Phys.;G"),
        ("PHYSICAL REVIEW", "Phys.Rev."),
        ("ADV THEO MATH PHYS", "Adv.Theor.Math.Phys."),
        ("MATH PHYS", "Math.Phys."),
        ("J MATH PHYS", "J.Math.Phys."),
        ("JHEP", "JHEP"),
        ("SITZUNGSBER PREUSS AKAD WISS PHYS MATH KL", "Sitzungsber.Preuss.Akad.Wiss.Berlin (Math.Phys.)"),
        ("PHYS LETT", "Phys.Lett."),
        ("NUCL PHYS", "Nucl.Phys."),
        ("NUCL PHYS", "Nucl.Phys."),
        ("NUCL PHYS PROC SUPPL", "Nucl.Phys.Proc.Suppl."),
        ("JINST", "JINST"),
        ("THE EUROPEAN PHYSICAL JOURNAL C PARTICLES AND FIELDS", "Eur.Phys.J.;C"),
        ("COMMUN MATH PHYS", "Commun.Math.Phys."),
        ("COMM MATH PHYS", "Commun.Math.Phys."),
        ("REV MOD PHYS", "Rev.Mod.Phys."),
        ("ANN PHYS U S", "Ann.Phys."),
        ("AM J PHYS", "Am.J.Phys."),
        ("PROC R SOC LONDON SER", "Proc.Roy.Soc.Lond."),
        ("CLASS QUANT GRAVITY", "Class.Quant.Grav."),
        ("FOUND PHYS", "Found.Phys."),
        ("IEEE TRANS NUCL SCI", "IEEE Trans.Nucl.Sci."),
        ("SCIENCE", "Science"),
        ("ACTA MATERIALIA", "Acta Mater."),
        ("REVIEWS OF MODERN PHYSICS", "Rev.Mod.Phys."),
        ("NUCL INSTRUM METHODS", "Nucl.Instrum.Meth."),
        ("Z PHYS", "Z.Phys."),
        ("Eur. Phys. J.", "Eur.Phys.J."),
    ]
    self.kb_journals = journals

    # Journal entries given as raw "pattern---replacement" lines.
    journals_re = [
        "DAN---Dokl.Akad.Nauk Ser.Fiz.",
    ]
    self.kb_journals_re = journals_re

    # Report numbers given as raw knowledge-base lines.
    report_numbers = [
        "#####CERN#####",
        "< yy 999>",
        "< yyyy 999>",
        "ATL PHYS INT---ATL-PHYS-INT",
        "#####LHC#####",
        "< yy 999>",
        "<syyyy 999>",
        "< 999>",
        "< 9999>",
        "CERN LHC PROJECT REPORT---CERN-LHC-Project-Report",
        "CLIC NOTE ---CERN-CLIC-Note",
        "CERN LHCC ---CERN-LHCC",
        "CERN EP ---CERN-EP",
        "######ATLANTIS#######",
        "< 9999999>",
        "CERN EX---CERN-EX",
    ]
    self.kb_report_numbers = report_numbers

    setup_loggers(verbosity=0)
    self.maxDiff = 2500
def tearDown(self):
    """Restore the CFG_INSPIRE_SITE value that setUp saved in old_inspire."""
    refextract_record.CFG_INSPIRE_SITE = self.old_inspire
def test_year_title_volume_page(self):
    # The year in parentheses comes before the journal title; the volume
    # and page follow the title.
    citation = u"[14] L. Randall and R. Sundrum, (1999) Phys. Rev. Lett. B83 S08004 More text"
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="h">L. Randall and R. Sundrum</subfield>
<subfield code="s">Phys.Rev.Lett.,B83,S08004</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_url1(self):
    # HTML anchors and bare URLs mixed with two journal citations; four
    # datafields are expected.
    citation = u"""[1] <a href="http://cdsweb.cern.ch/">CERN Document Server</a> J. Maldacena, Adv. Theor. Math. Phys. 2 (1998) 231, hep-th/9711200; http://cdsweb.cern.ch/ then http://www.itp.ucsb.edu/online/susyc99/discussion/. ; L. Susskind, J. Math. Phys. 36 (1995) 6377, hep-th/9409089; hello world a<a href="http://uk.yahoo.com/">Yahoo!</a>. Fin."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="u">http://cdsweb.cern.ch/</subfield>
<subfield code="z">CERN Document Server</subfield>
<subfield code="h">J. Maldacena</subfield>
<subfield code="s">Adv.Theor.Math.Phys.,2,231</subfield>
<subfield code="r">hep-th/9711200</subfield>
<subfield code="y">1998</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="u">http://cdsweb.cern.ch/</subfield>
<subfield code="u">http://www.itp.ucsb.edu/online/susyc99/discussion/</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">L. Susskind</subfield>
<subfield code="s">J.Math.Phys.,36,6377</subfield>
<subfield code="r">hep-th/9409089</subfield>
<subfield code="y">1995</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="u">http://uk.yahoo.com/</subfield>
<subfield code="z">Yahoo!</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_url2(self):
    # The report number and the bare URL after the semicolon are expected
    # in a second datafield.
    citation = u"""[2] J. Maldacena, Adv. Theor. Math. Phys. 2 (1998) 231; hep-th/9711200. http://cdsweb.cern.ch/"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">2</subfield>
<subfield code="h">J. Maldacena</subfield>
<subfield code="s">Adv.Theor.Math.Phys.,2,231</subfield>
<subfield code="y">1998</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">2</subfield>
<subfield code="r">hep-th/9711200</subfield>
<subfield code="u">http://cdsweb.cern.ch/</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_url3(self):
    # A quoted title followed by "URL <address>"; the line marker is "3."
    # rather than "[3]".
    citation = u"3. “pUML Initial Submission to OMG’ s RFP for UML 2.0 Infrastructure”. URL http://www.cs.york.ac.uk/puml/"
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">3</subfield>
<subfield code="t">pUML Initial Submission to OMG\u2019 s RFP for UML 2.0 Infrastructure</subfield>
<subfield code="u">http://www.cs.york.ac.uk/puml/</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_url4(self):
    # URL with a query string; its "&" is written as "&amp;" in the
    # expected XML.
    citation = u"""[3] S. Gubser, I. Klebanov and A. Polyakov, Phys. Lett. B428 (1998) 105; hep-th/9802109. http://cdsweb.cern.ch/search.py?AGE=hello-world&ln=en"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">3</subfield>
<subfield code="h">S. Gubser, I. Klebanov and A. Polyakov</subfield>
<subfield code="s">Phys.Lett.,B428,105</subfield>
<subfield code="y">1998</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">3</subfield>
<subfield code="r">hep-th/9802109</subfield>
<subfield code="u">http://cdsweb.cern.ch/search.py?AGE=hello-world&amp;ln=en</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_url5(self):
    # The URL ends the sentence; the final period is not expected to be
    # part of the extracted URL.
    citation = u"""[9] H. J. Drescher and Y. Nara, Phys. Rev. C 75, 034905 (2007); MC-KLN 3.46 at http://www.aiu.ac.jp/ynara/mckln/."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">9</subfield>
<subfield code="h">H. J. Drescher and Y. Nara</subfield>
<subfield code="s">Phys.Rev.,C75,034905</subfield>
<subfield code="y">2007</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">9</subfield>
<subfield code="u">http://www.aiu.ac.jp/ynara/mckln/</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_hep(self):
    # Authors followed only by a hep-th report number.
    citation = u"""[5] O. Aharony, S. Gubser, J. Maldacena, H. Ooguri and Y. Oz, hep-th/9905111."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">5</subfield>
<subfield code="h">O. Aharony, S. Gubser, J. Maldacena, H. Ooguri and Y. Oz</subfield>
<subfield code="r">hep-th/9905111</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_hep2(self):
    # Journal citation, then a report number after a semicolon; the report
    # number is expected in its own datafield.
    citation = u"""[4] E. Witten, Adv. Theor. Math. Phys. 2 (1998) 253; hep-th/9802150."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="h">E. Witten</subfield>
<subfield code="s">Adv.Theor.Math.Phys.,2,253</subfield>
<subfield code="y">1998</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="r">hep-th/9802150</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_hep3(self):
    # Journal citation and a semicolon-separated report number, expected
    # as two datafields.
    citation = u"""[6] L. Susskind, J. Math. Phys. 36 (1995) 6377; hep-th/9409089."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">L. Susskind</subfield>
<subfield code="s">J.Math.Phys.,36,6377</subfield>
<subfield code="y">1995</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="r">hep-th/9409089</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_hep4(self):
    # Two authors followed only by a hep-th report number.
    citation = u"""[7] L. Susskind and E. Witten, hep-th/9805114."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">7</subfield>
<subfield code="h">L. Susskind and E. Witten</subfield>
<subfield code="r">hep-th/9805114</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_double_hep_no_semi_colon(self):
    # In the second citation the report number follows a comma (not a
    # semicolon) and is expected in the same datafield as the journal.
    citation = u"""[7] W. Fischler and L. Susskind, hep-th/9806039; N. Kaloper and A. Linde, Phys. Rev. D60 (1999) 105509, hep-th/9904120."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">7</subfield>
<subfield code="h">W. Fischler and L. Susskind</subfield>
<subfield code="r">hep-th/9806039</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">7</subfield>
<subfield code="h">N. Kaloper and A. Linde</subfield>
<subfield code="s">Phys.Rev.,D60,105509</subfield>
<subfield code="r">hep-th/9904120</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_colon_sep(self):
    # Volume and page are separated by a colon: "JHEP 9906:028".
    citation = u"""[9] R. Bousso, JHEP 9906:028 (1999); hep-th/9906022."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">9</subfield>
<subfield code="h">R. Bousso</subfield>
<subfield code="s">JHEP,9906,028</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">9</subfield>
<subfield code="r">hep-th/9906022</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_book1(self):
    """book with authors and title but no quotes"""
    # Only the line marker and the authors are expected in the output.
    citation = u"""[10] R. Penrose and W. Rindler, Spinors and Spacetime, volume 2, chapter 9 (Cambridge University Press, Cambridge, 1986)."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">10</subfield>
<subfield code="h">R. Penrose and W. Rindler</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_hep_combined(self):
    # One journal citation followed by three semicolon-separated report
    # numbers, each expected in its own datafield.
    citation = u"""[11] R. Britto-Pacumio, A. Strominger and A. Volovich, JHEP 9911:013 (1999); hep-th/9905210; blah hep-th/9905211; blah hep-ph/9711200"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">11</subfield>
<subfield code="h">R. Britto-Pacumio, A. Strominger and A. Volovich</subfield>
<subfield code="s">JHEP,9911,013</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">11</subfield>
<subfield code="r">hep-th/9905210</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">11</subfield>
<subfield code="r">hep-th/9905211</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">11</subfield>
<subfield code="r">hep-ph/9711200</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc5(self):
    # "Commun. Math. Phys." citation with a semicolon-separated report
    # number.
    citation = u"""[12] V. Balasubramanian and P. Kraus, Commun. Math. Phys. 208 (1999) 413; hep-th/9902121."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="h">V. Balasubramanian and P. Kraus</subfield>
<subfield code="s">Commun.Math.Phys.,208,413</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="r">hep-th/9902121</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc6(self):
    # "Phys. Rev. Lett." citation with a semicolon-separated report number.
    citation = u"""[13] V. Balasubramanian and P. Kraus, Phys. Rev. Lett. 83 (1999) 3605; hep-th/9903190."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">13</subfield>
<subfield code="h">V. Balasubramanian and P. Kraus</subfield>
<subfield code="s">Phys.Rev.Lett.,83,3605</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">13</subfield>
<subfield code="r">hep-th/9903190</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_hep5(self):
    # Three authors followed only by a hep-th report number.
    citation = u"""[14] P. Kraus, F. Larsen and R. Siebelink, hep-th/9906127."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="h">P. Kraus, F. Larsen and R. Siebelink</subfield>
<subfield code="r">hep-th/9906127</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_report1(self):
    # A hep-th number and a CERN-LHC-Project-Report number are expected
    # together in the second datafield.
    citation = u"""[15] L. Randall and R. Sundrum, Phys. Rev. Lett. 83 (1999) 4690; hep-th/9906064. this is a test RN of a different type: CERN-LHC-Project-Report-2006. more text."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">L. Randall and R. Sundrum</subfield>
<subfield code="s">Phys.Rev.Lett.,83,4690</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="r">hep-th/9906064</subfield>
<subfield code="r">CERN-LHC-Project-Report-2006</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_hep6(self):
    # Single author followed only by a hep-th report number.
    citation = u"""[16] S. Gubser, hep-th/9912001."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">16</subfield>
<subfield code="h">S. Gubser</subfield>
<subfield code="r">hep-th/9912001</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_triple_hep(self):
    # Three semicolon-separated "authors, report number" citations, each
    # expected in its own datafield.
    citation = u"""[17] H. Verlinde, hep-th/9906182; H. Verlinde, hep-th/9912018; J. de Boer, E. Verlinde and H. Verlinde, hep-th/9912012."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">17</subfield>
<subfield code="h">H. Verlinde</subfield>
<subfield code="r">hep-th/9906182</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">17</subfield>
<subfield code="h">H. Verlinde</subfield>
<subfield code="r">hep-th/9912018</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">17</subfield>
<subfield code="h">J. de Boer, E. Verlinde and H. Verlinde</subfield>
<subfield code="r">hep-th/9912012</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_url_no_tag(self):
    # A bare URL (no HTML anchor) after a double-quoted title.
    citation = u"""[18] E. Witten, remarks at ITP Santa Barbara conference, "New dimensions in field theory and string theory": http://www.itp.ucsb.edu/online/susyc99/discussion/."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">18</subfield>
<subfield code="h">E. Witten</subfield>
<subfield code="t">New dimensions in field theory and string theory</subfield>
<subfield code="u">http://www.itp.ucsb.edu/online/susyc99/discussion/</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_simple(self):
    # Plain "authors, journal volume (year) page" citation.
    citation = u"""[19] D. Page and C. Pope, Commun. Math. Phys. 127 (1990) 529."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">19</subfield>
<subfield code="h">D. Page and C. Pope</subfield>
<subfield code="s">Commun.Math.Phys.,127,529</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_unknown_report(self):
ref_line = u"""[20] M. Duff, B. Nilsson and C. Pope, Physics Reports 130 (1986), chapter 9."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">20</subfield>
<subfield code="h">M. Duff, B. Nilsson and C. Pope</subfield>
</datafield>
</record>""")
def test_journal_volume_with_letter(self):
    # The volume carries a letter prefix: "B79".
    citation = u"""[21] D. Page, Phys. Lett. B79 (1978) 235."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">21</subfield>
<subfield code="h">D. Page</subfield>
<subfield code="s">Phys.Lett.,B79,235</subfield>
<subfield code="y">1978</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_with_hep1(self):
    # Two journal citations separated by a semicolon; the comma-separated
    # report number stays with the first one.
    citation = u"""[22] M. Cassidy and S. Hawking, Phys. Rev. D57 (1998) 2372, hep-th/9709066; S. Hawking, Phys. Rev. D52 (1995) 5681."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">22</subfield>
<subfield code="h">M. Cassidy and S. Hawking</subfield>
<subfield code="s">Phys.Rev.,D57,2372</subfield>
<subfield code="r">hep-th/9709066</subfield>
<subfield code="y">1998</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">22</subfield>
<subfield code="h">S. Hawking</subfield>
<subfield code="s">Phys.Rev.,D52,5681</subfield>
<subfield code="y">1995</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_hep7(self):
    # Two authors followed only by a hep-th report number.
    citation = u"""[23] K. Skenderis and S. Solodukhin, hep-th/9910023."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">23</subfield>
<subfield code="h">K. Skenderis and S. Solodukhin</subfield>
<subfield code="r">hep-th/9910023</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_with_hep2(self):
    # Colon-separated JHEP numeration with a comma-separated report
    # number, expected in a single datafield.
    citation = u"""[24] M. Henningson and K. Skenderis, JHEP 9807:023 (1998), hep-th/9806087."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">24</subfield>
<subfield code="h">M. Henningson and K. Skenderis</subfield>
<subfield code="s">JHEP,9807,023</subfield>
<subfield code="r">hep-th/9806087</subfield>
<subfield code="y">1998</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_unknown_book(self):
    # Only the authors and the double-quoted title are expected; the book
    # the article appeared in is not extracted.
    citation = u"""[25] C. Fefferman and C. Graham, "Conformal Invariants", in Elie Cartan et les Mathematiques d'aujourd'hui (Asterisque, 1985) 95."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">25</subfield>
<subfield code="h">C. Fefferman and C. Graham</subfield>
<subfield code="t">Conformal Invariants</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_hep8(self):
    # Author initials containing a hyphen ("S.-T. Yau").
    citation = u"""[27] E. Witten and S.-T. Yau, hep-th/9910245."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">27</subfield>
<subfield code="h">E. Witten and S.-T. Yau</subfield>
<subfield code="r">hep-th/9910245</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_hep9(self):
    # Colon-separated JHEP numeration, then a semicolon-separated report
    # number in its own datafield.
    citation = u"""[28] R. Emparan, JHEP 9906:036 (1999); hep-th/9906040."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">28</subfield>
<subfield code="h">R. Emparan</subfield>
<subfield code="s">JHEP,9906,036</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">28</subfield>
<subfield code="r">hep-th/9906040</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_with_hep3(self):
    # Two "journal, report number" citations separated by a semicolon.
    citation = u"""[29] A. Chamblin, R. Emparan, C. Johnson and R. Myers, Phys. Rev. D59 (1999) 64010, hep-th/9808177; S. Hawking, C. Hunter and D. Page, Phys. Rev. D59 (1998) 44033, hep-th/9809035."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">29</subfield>
<subfield code="h">A. Chamblin, R. Emparan, C. Johnson and R. Myers</subfield>
<subfield code="s">Phys.Rev.,D59,64010</subfield>
<subfield code="r">hep-th/9808177</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">29</subfield>
<subfield code="h">S. Hawking, C. Hunter and D. Page</subfield>
<subfield code="s">Phys.Rev.,D59,44033</subfield>
<subfield code="r">hep-th/9809035</subfield>
<subfield code="y">1998</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_with_hep4(self):
    # Two "journal, report number" citations from different journals,
    # separated by a semicolon.
    citation = u"""[30] S. Sethi and L. Susskind, Phys. Lett. B400 (1997) 265, hep-th/9702101; T. Banks and N. Seiberg, Nucl. Phys. B497 (1997) 41, hep-th/9702187."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">30</subfield>
<subfield code="h">S. Sethi and L. Susskind</subfield>
<subfield code="s">Phys.Lett.,B400,265</subfield>
<subfield code="r">hep-th/9702101</subfield>
<subfield code="y">1997</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">30</subfield>
<subfield code="h">T. Banks and N. Seiberg</subfield>
<subfield code="s">Nucl.Phys.,B497,41</subfield>
<subfield code="r">hep-th/9702187</subfield>
<subfield code="y">1997</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc7(self):
    # "Phys. Rev. D60" citation with a semicolon-separated report number.
    citation = u"""[31] R. Emparan, C. Johnson and R. Myers, Phys. Rev. D60 (1999) 104001; hep-th/9903238."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">31</subfield>
<subfield code="h">R. Emparan, C. Johnson and R. Myers</subfield>
<subfield code="s">Phys.Rev.,D60,104001</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">31</subfield>
<subfield code="r">hep-th/9903238</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc8(self):
    # The page number has a leading zero ("064005"), which is expected to
    # be kept.
    citation = u"""[32] S. Hawking, C. Hunter and M. Taylor-Robinson, Phys. Rev. D59 (1999) 064005; hep-th/9811056."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">32</subfield>
<subfield code="h">S. Hawking, C. Hunter and M. Taylor-Robinson</subfield>
<subfield code="s">Phys.Rev.,D59,064005</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">32</subfield>
<subfield code="r">hep-th/9811056</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc9(self):
    # "Class. Quant. Grav." citation with a semicolon-separated report
    # number.
    citation = u"""[33] J. Dowker, Class. Quant. Grav. 16 (1999) 1937; hep-th/9812202."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">33</subfield>
<subfield code="h">J. Dowker</subfield>
<subfield code="s">Class.Quant.Grav.,16,1937</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">33</subfield>
<subfield code="r">hep-th/9812202</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal3(self):
    # Plain journal citation with a letter-prefixed volume ("D47").
    citation = u"""[34] J. Brown and J. York, Phys. Rev. D47 (1993) 1407."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">34</subfield>
<subfield code="h">J. Brown and J. York</subfield>
<subfield code="s">Phys.Rev.,D47,1407</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>""".replace(u"<subfield code=\"y\">1999</subfield>", u"<subfield code=\"y\">1993</subfield>")
    _reference_test(self, citation, expected_marcxml)
def test_misc10(self):
    # A complete "IBID A 546 (1999) 96" is expected to become a
    # Nucl.Phys. citation with the authors of the previous one repeated.
    citation = u"""[35] D. Freedman, S. Mathur, A. Matsuis and L. Rastelli, Nucl. Phys. B546 (1999) 96; hep-th/9804058. More text, followed by an IBID A 546 (1999) 96"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">35</subfield>
<subfield code="h">D. Freedman, S. Mathur, A. Matsuis and L. Rastelli</subfield>
<subfield code="s">Nucl.Phys.,B546,96</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">35</subfield>
<subfield code="r">hep-th/9804058</subfield>
<subfield code="h">D. Freedman, S. Mathur, A. Matsuis and L. Rastelli</subfield>
<subfield code="s">Nucl.Phys.,A546,96</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc11(self):
    # An incomplete "IBID A" (no volume, year or page) is expected to
    # produce no additional journal subfield.
    citation = u"""[36] D. Freedman, S. Mathur, A. Matsuis and L. Rastelli, Nucl. Phys. B546 (1999) 96; hep-th/9804058. More text, followed by an IBID A"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">36</subfield>
<subfield code="h">D. Freedman, S. Mathur, A. Matsuis and L. Rastelli</subfield>
<subfield code="s">Nucl.Phys.,B546,96</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">36</subfield>
<subfield code="r">hep-th/9804058</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc12(self):
    # Each report number is expected in its own datafield together with a
    # "0" subfield (93, 92, 96).
    # NOTE(review): the "0" values are presumably record ids resolved from
    # the test database -- confirm.
    citation = u"""[37] some misc lkjslkdjlksjflksj [hep-th/0703265] lkjlkjlkjlkj [hep-th/0606096], hep-ph/0002060, some more misc; Nucl. Phys. B546 (1999) 96"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="r">hep-th/0703265</subfield>
<subfield code="0">93</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="r">hep-th/0606096</subfield>
<subfield code="0">92</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="r">hep-ph/0002060</subfield>
<subfield code="0">96</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="s">Nucl.Phys.,B546,96</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc13(self):
    """Authors and journal first, then two report numbers inside noisy misc text."""
    citation = u"""[38] R. Emparan, C. Johnson and R.. Myers, Phys. Rev. D60 (1999) 104001; this is :: .... misc! hep-th/0703265. and some ...,.,.,.,::: more hep-th/0606096"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">38</subfield>
<subfield code="h">R. Emparan, C. Johnson and R.. Myers</subfield>
<subfield code="s">Phys.Rev.,D60,104001</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">38</subfield>
<subfield code="r">hep-th/0703265</subfield>
<subfield code="0">93</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">38</subfield>
<subfield code="r">hep-th/0606096</subfield>
<subfield code="0">92</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc14(self):
    """Same as test_misc12 but with report numbers unknown to the system.

    The three report numbers are expected together in one datafield,
    without any `0` subfield.
    """
    citation = u"""[37] some misc lkjslkdjlksjflksj [hep-th/9206059] lkjlkjlkjlkj [hep-th/9206060], hep-ph/9206061, some more misc; Nucl. Phys. B546 (1999) 96"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="r">hep-th/9206059</subfield>
<subfield code="r">hep-th/9206060</subfield>
<subfield code="r">hep-ph/9206061</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="s">Nucl.Phys.,B546,96</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc15(self):
    """Same as test_misc13 but with report numbers unknown to the system.

    Both report numbers are expected in a single datafield, without any
    `0` subfield.
    """
    citation = u"""[38] R. Emparan, C. Johnson and R.. Myers, Phys. Rev. D60 (1999) 104001; this is :: .... misc! hep-th/9206059. and some ...,.,.,.,::: more hep-th/9206060"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">38</subfield>
<subfield code="h">R. Emparan, C. Johnson and R.. Myers</subfield>
<subfield code="s">Phys.Rev.,D60,104001</subfield>
<subfield code="y">1999</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">38</subfield>
<subfield code="r">hep-th/9206059</subfield>
<subfield code="r">hep-th/9206060</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_with_hep5(self):
    """JHEP 'volume(year) page' citation followed by a bracketed report number."""
    citation = u"""[39] A. Ceresole, G. Dall Agata and R. D Auria, JHEP 11(1999) 009, [hep-th/9907216]."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">A. Ceresole, G. Dall Agata and R. D Auria</subfield>
<subfield code="s">JHEP,9911,009</subfield>
<subfield code="r">hep-th/9907216</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_with_hep6(self):
    """Journal 'volume, page (year)' citation followed by a bracketed report number."""
    citation = u"""[40] D.P. Jatkar and S. Randjbar-Daemi, Phys. Lett. B460, 281 (1999) [hep-th/9904187]."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">40</subfield>
<subfield code="h">D.P. Jatkar and S. Randjbar-Daemi</subfield>
<subfield code="s">Phys.Lett.,B460,281</subfield>
<subfield code="r">hep-th/9904187</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_with_hep7(self):
    """Journal 'volume, (year) page' citation followed by a bracketed report number."""
    citation = u"""[41] G. DallAgata, Phys. Lett. B460, (1999) 79, [hep-th/9904198]."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">41</subfield>
<subfield code="h">G. DallAgata</subfield>
<subfield code="s">Phys.Lett.,B460,79</subfield>
<subfield code="r">hep-th/9904198</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_year_volume_page(self):
    """Journal citation written as 'title, year, volume, page'."""
    citation = u"""[43] Becchi C., Blasi A., Bonneau G., Collina R., Delduc F., Commun. Math. Phys., 1988, 120, 121."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">43</subfield>
<subfield code="h">Becchi C., Blasi A., Bonneau G., Collina R., Delduc F.</subfield>
<subfield code="s">Commun.Math.Phys.,120,121</subfield>
<subfield code="y">1988</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_volume_year_page1(self):
    """Journal citation written as 'title, volume, (year), page range'.

    Only the first page of the range is expected in the journal subfield.
    """
    citation = u"""[44]: N. Nekrasov, A. Schwarz, Instantons on noncommutative R4 and (2, 0) superconformal six-dimensional theory, Comm. Math. Phys., 198, (1998), 689-703."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">44</subfield>
<subfield code="h">N. Nekrasov, A. Schwarz</subfield>
<subfield code="s">Commun.Math.Phys.,198,689</subfield>
<subfield code="y">1998</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_volume_year_page2(self):
    """Second 'title, volume, (year), page range' citation, single author."""
    citation = u"""[42] S.M. Donaldson, Instantons and Geometric Invariant Theory, Comm. Math. Phys., 93, (1984), 453-460."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">42</subfield>
<subfield code="h">S.M. Donaldson</subfield>
<subfield code="s">Commun.Math.Phys.,93,453</subfield>
<subfield code="y">1984</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_many_references_in_one_line(self):
    """Eight semicolon-separated citations in one line, two of them 'ibid' entries."""
    citation = u"""[45] H. J. Bhabha, Rev. Mod. Phys. 17, 200(1945); ibid, 21, 451(1949); S. Weinberg, Phys. Rev. 133, B1318(1964); ibid, 134, 882(1964); D. L. Pursey, Ann. Phys(U. S)32, 157(1965); W. K. Tung, Phys, Rev. Lett. 16, 763(1966); Phys. Rev. 156, 1385(1967); W. J. Hurley, Phys. Rev. Lett. 29, 1475(1972)."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">H. J. Bhabha</subfield>
<subfield code="s">Rev.Mod.Phys.,17,200</subfield>
<subfield code="y">1945</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">H. J. Bhabha</subfield>
<subfield code="s">Rev.Mod.Phys.,21,451</subfield>
<subfield code="y">1949</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">S. Weinberg</subfield>
<subfield code="s">Phys.Rev.,133,B1318</subfield>
<subfield code="y">1964</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">S. Weinberg</subfield>
<subfield code="s">Phys.Rev.,134,882</subfield>
<subfield code="y">1964</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">D. L. Pursey</subfield>
<subfield code="s">Ann.Phys.,32,157</subfield>
<subfield code="y">1965</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">W. K. Tung</subfield>
<subfield code="s">Phys.Rev.Lett.,16,763</subfield>
<subfield code="y">1966</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="s">Phys.Rev.,156,1385</subfield>
<subfield code="y">1967</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">45</subfield>
<subfield code="h">W. J. Hurley</subfield>
<subfield code="s">Phys.Rev.Lett.,29,1475</subfield>
<subfield code="y">1972</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_ibid(self):
    """Simple ibid test: the ibid entry repeats the preceding journal title and author."""
    citation = u"""[46] E. Schrodinger, Sitzungsber. Preuss. Akad. Wiss. Phys. Math. Kl. 24, 418(1930); ibid, 3, 1(1931)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">Sitzungsber.Preuss.Akad.Wiss.Berlin (Math.Phys.),24,418</subfield>
<subfield code="y">1930</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">Sitzungsber.Preuss.Akad.Wiss.Berlin (Math.Phys.),3,1</subfield>
<subfield code="y">1931</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_ibid2(self):
    """Series has to be recognized for ibid to work properly.

    The ibid entry gives no series letter of its own and is expected to
    come out with the 'G' of the preceding J.Phys. citation.
    """
    citation = u"""[46] E. Schrodinger, J.Phys. G 24, 418 (1930); ibid, 3, 1(1931)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">J.Phys.,G24,418</subfield>
<subfield code="y">1930</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">J.Phys.,G3,1</subfield>
<subfield code="y">1931</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_ibid3(self):
    """Series after volume has to be recognized for ibid to work properly."""
    # NOTE(review): this input line is character-for-character the same as
    # the one used by test_ibid2 (series letter *before* the volume), so the
    # "series after volume" case named above is presumably not exercised
    # here -- confirm the intended input before changing it.
    citation = u"""[46] E. Schrodinger, J.Phys. G 24, 418 (1930); ibid, 3, 1(1931)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">J.Phys.,G24,418</subfield>
<subfield code="y">1930</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">J.Phys.,G3,1</subfield>
<subfield code="y">1931</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_ibid4(self):
    """Series has to be recognized for ibid to work properly.

    Here the ibid entry names its own series ('A'), which is expected in
    the output instead of the 'G' of the preceding citation.
    """
    citation = u"""[46] E. Schrodinger, J.Phys. G 24, 418 (1930); ibid, A 3, 1(1931)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">J.Phys.,G24,418</subfield>
<subfield code="y">1930</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
<subfield code="s">J.Phys.,A3,1</subfield>
<subfield code="y">1931</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_invalid_ibid(self):
    """Ibid with no preceding journal needs to go to misc text.

    No journal subfield is expected in the output, only the author.
    """
    citation = u"""[46] E. Schrodinger, ibid, 3, 1(1931)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">46</subfield>
<subfield code="h">E. Schrodinger</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc4(self):
    """Journal written with 'Ser. A155', followed by an ibid that names series D."""
    citation = u"""[47] P. A. M. Dirac, Proc. R. Soc. London, Ser. A155, 447(1936); ibid, D24, 3333(1981)."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">47</subfield>
<subfield code="h">P. A. M. Dirac</subfield>
<subfield code="s">Proc.Roy.Soc.Lond.,A155,447</subfield>
<subfield code="y">1936</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">47</subfield>
<subfield code="h">P. A. M. Dirac</subfield>
<subfield code="s">Proc.Roy.Soc.Lond.,D24,3333</subfield>
<subfield code="y">1981</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_doi(self):
    """A 'doi:'-prefixed DOI and an arXiv identifier are both expected in the output."""
    citation = u"""[48] O.O. Vaneeva, R.O. Popovych and C. Sophocleous, Enhanced Group Analysis and Exact Solutions of Vari-able Coefficient Semilinear Diffusion Equations with a Power Source, Acta Appl. Math., doi:10.1007/s10440-008-9280-9, 46 p., arXiv:0708.3457."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">48</subfield>
<subfield code="h">O.O. Vaneeva, R.O. Popovych and C. Sophocleous</subfield>
<subfield code="a">10.1007/s10440-008-9280-9</subfield>
<subfield code="r">arXiv:0708.3457</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_doi2(self):
    """DOI given as a dx.doi.org URL; its angle brackets appear XML-escaped in the output."""
    citation = u"""[1] http://dx.doi.org/10.1175/1520-0442(2000)013<2671:TAORTT>2.0.CO;2"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="a">10.1175/1520-0442(2000)013&lt;2671:TAORTT&gt;2.0.CO;2</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc3(self):
    """Full citation ending in a dx.doi.org URL: authors and DOI are expected."""
    citation = u"""[49] M. I. Trofimov, N. De Filippis and E. A. Smolenskii. Application of the electronegativity indices of organic molecules to tasks of chemical informatics. Russ. Chem. Bull., 54:2235-2246, 2005. http://dx.doi.org/10.1007/s11172-006-0105-6."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">49</subfield>
<subfield code="h">M. I. Trofimov, N. De Filippis and E. A. Smolenskii</subfield>
<subfield code="a">10.1007/s11172-006-0105-6</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc2(self):
    """Three semicolon-separated citations: one with a publisher, one with authors only, one with a journal."""
    citation = u"""[50] M. Gell-Mann, P. Ramon ans R. Slansky, in Supergravity, P. van Niewenhuizen and D. Freedman (North-Holland 1979); T. Yanagida, in Proceedings of the Workshop on the Unified Thoery and the Baryon Number in teh Universe, ed. O. Sawaga and A. Sugamoto (Tsukuba 1979); R.N. Mohapatra and G. Senjanovic, Phys. Rev. Lett. 44, 912, (1980)."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">50</subfield>
<subfield code="h">M. Gell-Mann, P. Ramon ans R. Slansky</subfield>
<subfield code="p">North-Holland</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">50</subfield>
<subfield code="h">T. Yanagida</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">50</subfield>
<subfield code="h">R.N. Mohapatra and G. Senjanovic</subfield>
<subfield code="s">Phys.Rev.Lett.,44,912</subfield>
<subfield code="y">1980</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_misc1(self):
    """Four semicolon-separated journal citations, one of them attributed to editors."""
    citation = u"""[51] L.S. Durkin and P. Langacker, Phys. Lett B166, 436 (1986); Amaldi et al., Phys. Rev. D36, 1385 (1987); Hayward and Yellow et al., eds. Phys. Lett B245, 669 (1990); Nucl. Phys. B342, 15 (1990);"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">51</subfield>
<subfield code="h">L.S. Durkin and P. Langacker</subfield>
<subfield code="s">Phys.Lett.,B166,436</subfield>
<subfield code="y">1986</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">51</subfield>
<subfield code="h">Amaldi et al.</subfield>
<subfield code="s">Phys.Rev.,D36,1385</subfield>
<subfield code="y">1987</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">51</subfield>
<subfield code="h">(Hayward and Yellow et al. (eds.))</subfield>
<subfield code="s">Phys.Lett.,B245,669</subfield>
<subfield code="y">1990</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">51</subfield>
<subfield code="s">Nucl.Phys.,B342,15</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_combination_of_authors_names(self):
    """Author names in varied formats, plus a quoted title and a publisher."""
    citation = u"""[53] Hush, D.R., R.Leighton, and B.G. Horne, 1993. "Progress in supervised Neural Netw. What's new since Lippmann?" IEEE Signal Process. Magazine 10, 8-39"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">53</subfield>
<subfield code="h">Hush, D.R., R.Leighton, and B.G. Horne</subfield>
<subfield code="t">Progress in supervised Neural Netw. What's new since Lippmann?</subfield>
<subfield code="p">IEEE</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_two_initials_no_space(self):
    """Author with two initials written without a space between them, e.g. T.G. Rizzo."""
    citation = u"""[54] T.G. Rizzo, Phys. Rev. D40, 3035 (1989)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">54</subfield>
<subfield code="h">T.G. Rizzo</subfield>
<subfield code="s">Phys.Rev.,D40,3035</subfield>
<subfield code="y">1989</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_surname_prefix_van(self):
    """An author with prefix + surname, e.g. van Niewenhuizen."""
    citation = u"""[55] Hawking S., P. van Niewenhuizen, L.S. Durkin, D. Freeman, some title of some journal"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">55</subfield>
<subfield code="h">Hawking S., P. van Niewenhuizen, L.S. Durkin, D. Freeman</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_authors_coma_but_no_journal(self):
    """Two authors separated by a comma, with no journal in the line."""
    citation = u"""[56] Hawking S., D. Freeman, some title of some journal"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">56</subfield>
<subfield code="h">Hawking S., D. Freeman</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_authors_and_but_no_journal(self):
    """Two authors joined by the word 'and', with no journal in the line."""
    citation = u"""[57] Hawking S. and D. Freeman, another random title of some random journal"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">57</subfield>
<subfield code="h">Hawking S. and D. Freeman</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_simple_et_al(self):
    """Author ending with 'et al.' followed by a journal citation."""
    citation = u"""[1] Amaldi et al., Phys. Rev. D36, 1385 (1987)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">Amaldi et al.</subfield>
<subfield code="s">Phys.Rev.,D36,1385</subfield>
<subfield code="y">1987</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_ibid_two_journals(self):
    """IBIDEM test.

    Ibidem must copy the journal of the reference just before it and
    not the first journal of the line.
    """
    citation = u"""[58] Nucl. Phys. B342, 15 (1990); Phys. Lett. B261, 146 (1991); ibidem B263, 459 (1991);"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">58</subfield>
<subfield code="s">Nucl.Phys.,B342,15</subfield>
<subfield code="y">1990</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">58</subfield>
<subfield code="s">Phys.Lett.,B261,146</subfield>
<subfield code="y">1991</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">58</subfield>
<subfield code="s">Phys.Lett.,B263,459</subfield>
<subfield code="y">1991</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_collaboration(self):
    """Collaboration name ahead of the authors, with the year before the journal."""
    citation = u"""[60] HERMES Collaboration, Airapetian A et al. 2005 Phys. Rev. D 71 012003 1-36"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">60</subfield>
<subfield code="c">HERMES Collaboration</subfield>
<subfield code="h">Airapetian A et al.</subfield>
<subfield code="s">Phys.Rev.,D71,012003</subfield>
<subfield code="y">2005</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_weird_number_after_volume(self):
    """A trailing '1-26' after the page number must not end up in the journal subfield."""
    citation = u"""[61] de Florian D, Sassot R and Stratmann M 2007 Phys. Rev. D 75 114010 1-26"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">61</subfield>
<subfield code="h">de Florian D, Sassot R and Stratmann M</subfield>
<subfield code="s">Phys.Rev.,D75,114010</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_year_before_journal(self):
    """Year placed between the authors and the journal title."""
    citation = u"""[64] Bourrely C, Soffer J and Buccella F 2002 Eur. Phys. J. C 23 487-501"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">64</subfield>
<subfield code="h">Bourrely C, Soffer J and Buccella F</subfield>
<subfield code="s">Eur.Phys.J.,C23,487</subfield>
<subfield code="y">2002</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_non_recognized_reference(self):
    """Reference without a recognizable journal or report number: only authors are expected."""
    citation = u"""[63] Z. Guzik and R. Jacobsson, LHCb Readout Supervisor ’ODIN’ with a L1\nTrigger - Technical reference, Aug 2005, EDMS 704078-V1.0"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">63</subfield>
<subfield code="h">Z. Guzik and R. Jacobsson</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_year_stuck_to_volume(self):
    """Parenthesized year written with no space after the preceding number, e.g. 479(1952)."""
    citation = u"""[65] K. Huang, Am. J. Phys. 20, 479(1952)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">65</subfield>
<subfield code="h">K. Huang</subfield>
<subfield code="s">Am.J.Phys.,20,479</subfield>
<subfield code="y">1952</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_two_initials_after_surname(self):
    """Author with 2 dotted initials after the surname, e.g. Pate S. F."""
    citation = u"""[62] Pate S. F., McKee D. W. and Papavassiliou V. 2008 Phys.Rev. C 78 448"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">62</subfield>
<subfield code="h">Pate S. F., McKee D. W. and Papavassiliou V.</subfield>
<subfield code="s">Phys.Rev.,C78,448</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_one_initial_after_surname(self):
    """Author with 1 dotted initial after the surname, e.g. Pate S."""
    citation = u"""[62] Pate S., McKee D., 2008 Phys.Rev. C 78 448"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">62</subfield>
<subfield code="h">Pate S., McKee D.</subfield>
<subfield code="s">Phys.Rev.,C78,448</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_two_initials_no_dot_after_surname(self):
    """Author with 2 undotted initials after the surname, e.g. Pate S F"""
    citation = u"""[62] Pate S F, McKee D W and Papavassiliou V 2008 Phys.Rev. C 78 448"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">62</subfield>
<subfield code="h">Pate S F, McKee D W and Papavassiliou V</subfield>
<subfield code="s">Phys.Rev.,C78,448</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_one_initial_no_dot_after_surname(self):
    """Author with 1 undotted initial after the surname, e.g. Pate S"""
    citation = u"""[62] Pate S, McKee D, 2008 Phys.Rev. C 78 448"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">62</subfield>
<subfield code="h">Pate S, McKee D</subfield>
<subfield code="s">Phys.Rev.,C78,448</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_two_initials_before_surname(self):
    """Author with 2 dotted initials before the surname, e.g. G. A. Perkins"""
    citation = u"""[67] G. A. Perkins, Found. Phys. 6, 237(1976)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">67</subfield>
<subfield code="h">G. A. Perkins</subfield>
<subfield code="s">Found.Phys.,6,237</subfield>
<subfield code="y">1976</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_one_initial_before_surname(self):
    """Author with 1 dotted initial before the surname, e.g. G. Perkins"""
    citation = u"""[67] G. Perkins, Found. Phys. 6, 237(1976)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">67</subfield>
<subfield code="h">G. Perkins</subfield>
<subfield code="s">Found.Phys.,6,237</subfield>
<subfield code="y">1976</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_two_initials_no_dot_before_surname(self):
    """Author with 2 undotted initials before the surname, e.g. G A Perkins"""
    citation = u"""[67] G A Perkins, Found. Phys. 6, 237(1976)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">67</subfield>
<subfield code="h">G A Perkins</subfield>
<subfield code="s">Found.Phys.,6,237</subfield>
<subfield code="y">1976</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_one_initial_no_dot_before_surname(self):
    """Author with 1 undotted initial before the surname, e.g. G Perkins"""
    citation = u"""[67] G Perkins, Found. Phys. 6, 237(1976)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">67</subfield>
<subfield code="h">G Perkins</subfield>
<subfield code="s">Found.Phys.,6,237</subfield>
<subfield code="y">1976</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_ibid_twice(self):
    """Two consecutive ibid entries after one journal citation.

    Both are expected with the journal title and authors of the first
    citation; 'et al' is expected as 'et al.' in the output.
    """
    citation = u"""[68] A. O. Barut et al, Phys. Rev. D23, 2454(1981); ibid, D24, 3333(1981); ibid, D31, 1386(1985)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">68</subfield>
<subfield code="h">A. O. Barut et al.</subfield>
<subfield code="s">Phys.Rev.,D23,2454</subfield>
<subfield code="y">1981</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">68</subfield>
<subfield code="h">A. O. Barut et al.</subfield>
<subfield code="s">Phys.Rev.,D24,3333</subfield>
<subfield code="y">1981</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">68</subfield>
<subfield code="h">A. O. Barut et al.</subfield>
<subfield code="s">Phys.Rev.,D31,1386</subfield>
<subfield code="y">1985</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_no_authors(self):
    """Journal citation with no author in front of it."""
    citation = u"""[69] Phys. Rev. Lett. 52, 2009(1984)"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">69</subfield>
<subfield code="s">Phys.Rev.Lett.,52,2009</subfield>
<subfield code="y">1984</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_extra_01(self):
    """Parsed erroneously as Phys.Rev.Lett.,101,01

    The input spans two lines; two journal citations are expected.
    """
    citation = u"""[17] de Florian D, Sassot R, Stratmann M and Vogelsang W 2008 Phys. Rev. Lett. 101 072001 1-4; 2009 Phys.
Rev. D 80 034030 1-25"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">17</subfield>
<subfield code="h">de Florian D, Sassot R, Stratmann M and Vogelsang W</subfield>
<subfield code="s">Phys.Rev.Lett.,101,072001</subfield>
<subfield code="y">2008</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">17</subfield>
<subfield code="s">Phys.Rev.,D80,034030</subfield>
<subfield code="y">2009</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_extra_no_after_vol(self):
    """An issue number ('no. 5') between volume and pages is left out of the journal subfield."""
    citation = u"""[130] A. Kuper, H. Letaw, L. Slifkin, E-Sonder, and C. T. Tomizuka, “Self- diffusion in copper,” Physical Review, vol. 96, no. 5, pp. 1224–1225, 1954."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">130</subfield>
<subfield code="h">A. Kuper, H. Letaw, L. Slifkin, E-Sonder, and C. T. Tomizuka</subfield>
<subfield code="t">Self- diffusion in copper</subfield>
<subfield code="s">Phys.Rev.,96,1224</subfield>
<subfield code="y">1954</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_jinst(self):
    """JINST citation with a collaboration, authors and an alphanumeric page (S08003)."""
    citation = u"""[1] ATLAS Collaboration, G. Aad et al., The ATLAS Experiment at the CERN Large Hadron Collider, JINST 3 (2008) S08003."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="c">ATLAS Collaboration</subfield>
<subfield code="h">G. Aad et al.</subfield>
<subfield code="s">JINST,3,S08003</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_collaboration2(self):
    """Multi-word collaboration name followed by authors and a journal."""
    citation = u"""[28] Particle Data Group Collaboration, K. Nakamura et al., Review of particle physics, J. Phys. G37 (2010) 075021."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">28</subfield>
<subfield code="c">Particle Data Group Collaboration</subfield>
<subfield code="h">K. Nakamura et al.</subfield>
<subfield code="s">J.Phys.,G37,075021</subfield>
<subfield code="y">2010</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_sub_volume(self):
    """An issue number ('no. 2') after the year is left out; the page comes from the range after it."""
    citation = u"""[8] S. Horvat, D. Khartchenko, O. Kortner, S. Kotov, H. Kroha, A. Manz, S. Mohrdieck-Mock, K. Nikolaev, R. Richter, W. Stiller, C. Valderanis, J. Dubbert, F. Rauscher, and A. Staude, Operation of the ATLAS muon drift-tube chambers at high background rates and in magnetic fields, IEEE Trans. Nucl. Sci. 53 (2006) no. 2, 562–566"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">8</subfield>
<subfield code="h">S. Horvat, D. Khartchenko, O. Kortner, S. Kotov, H. Kroha, A. Manz, S. Mohrdieck-Mock, K. Nikolaev, R. Richter, W. Stiller, C. Valderanis, J. Dubbert, F. Rauscher, and A. Staude</subfield>
<subfield code="s">IEEE Trans.Nucl.Sci.,53,562</subfield>
<subfield code="y">2006</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_not_recognized(self):
    """Long-form title 'The European Physical Journal C - Particles and Fields' is expected as Eur.Phys.J.,C."""
    citation = u"""[33] A. Moraes, C. Buttar, and I. Dawson, Prediction for minimum bias and the underlying event at LHC energies, The European Physical Journal C - Particles and Fields 50 (2007) 435–466."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">33</subfield>
<subfield code="h">A. Moraes, C. Buttar, and I. Dawson</subfield>
<subfield code="s">Eur.Phys.J.,C50,435</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_multiple_eds(self):
    """Two authors, each followed by an '(ed.)' marker, ahead of a journal citation."""
    citation = u"""[7] L. Evans, (ed.) and P. Bryant, (ed.), LHC Machine, JINST 3 (2008) S08001."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">7</subfield>
<subfield code="h">L. Evans, (ed.) and P. Bryant, (ed.)</subfield>
<subfield code="s">JINST,3,S08001</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_atlas_conf(self):
    """Not recognizing preprint format (ATLAS-CONF-2010-031).

    The URL in the input is broken by a space; it is expected joined
    back together in the output.
    """
    citation = u"""[32] The ATLAS Collaboration, Charged particle multiplicities in pp interactions at √s = 0.9 and 7 TeV in a diffractive limited phase space measured with the ATLAS detector at the LHC and a new pythia6 tune, 2010. http://cdsweb.cern.ch/record/1266235/files/ ATLAS-COM-CONF-2010-031.pdf. ATLAS-CONF-2010-031."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">32</subfield>
<subfield code="c">ATLAS Collaboration</subfield>
<subfield code="u">http://cdsweb.cern.ch/record/1266235/files/ATLAS-COM-CONF-2010-031.pdf</subfield>
<subfield code="r">ATLAS-CONF-2010-031</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_of_physics(self):
    """Eventually not recognizing the journal, the collaboration or authors.

    All three are expected in the output for a 'Journal of Physics'
    citation.
    """
    citation = u"""[19] ATLAS Inner Detector software group Collaboration, T. Cornelissen, M. Elsing, I. Gavilenko, W. Liebig, E. Moyse, and A. Salzburger, The new ATLAS Track Reconstruction (NEWT), Journal of Physics 119 (2008) 032014."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">19</subfield>
<subfield code="c">ATLAS Inner Detector software group Collaboration</subfield>
<subfield code="h">T. Cornelissen, M. Elsing, I. Gavilenko, W. Liebig, E. Moyse, and A. Salzburger</subfield>
<subfield code="s">J.Phys.,119,032014</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_jhep(self):
    """Was splitting JHEP in JHE: P"""
    citation = u"""[22] G. P. Salam and G. Soyez, A practical seedless infrared-safe cone jet algorithm, JHEP 05 (2007) 086."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">22</subfield>
<subfield code="h">G. P. Salam and G. Soyez</subfield>
<subfield code="s">JHEP,0705,086</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_not_recognized2(self):
    """Journal written as 'J. Phys. G. Vol 30 N° 11 (2004) 232'; G30,232 is expected."""
    citation = u"""[3] Physics Performance Report Vol 1 – J. Phys. G. Vol 30 N° 11 (2004) 232"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">3</subfield>
<subfield code="s">J.Phys.,G30,232</subfield>
<subfield code="y">2004</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_not_recognized3(self):
    """Journal written as 'J. Phys. G. N° 30 (2004) 232'; G30,232 is expected."""
    citation = u"""[3] Physics Performance Report Vol 1 – J. Phys. G. N° 30 (2004) 232"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">3</subfield>
<subfield code="s">J.Phys.,G30,232</subfield>
<subfield code="y">2004</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_not_recognized4(self):
    """Long-form title 'Physical Review Special Topics - Accelerators and Beams' is expected as Phys.Rev.ST Accel.Beams."""
    citation = u"""[128] D. P. Pritzkau and R. H. Siemann, “Experimental study of rf pulsed heat- ing on oxygen free electronic copper,” Physical Review Special Topics - Accelerators and Beams, vol. 5, pp. 1–22, 2002."""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">128</subfield>
<subfield code="h">D. P. Pritzkau and R. H. Siemann</subfield>
<subfield code="t">Experimental study of rf pulsed heat- ing on oxygen free electronic copper</subfield>
<subfield code="s">Phys.Rev.ST Accel.Beams,5,1</subfield>
<subfield code="y">2002</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_journal_not_recognized5(self):
    """Series letter written after the volume number: '100B' is expected as B100."""
    citation = u"""[128] D. P. Pritzkau and R. H. Siemann, Phys.Lett. 100B (1981), 117"""
    expected_marcxml = u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">128</subfield>
<subfield code="h">D. P. Pritzkau and R. H. Siemann</subfield>
<subfield code="s">Phys.Lett.,B100,117</subfield>
<subfield code="y">1981</subfield>
</datafield>
</record>"""
    _reference_test(self, citation, expected_marcxml)
def test_note_format1(self):
"""'CLIC Note 615, CERN' is expected as report number CERN-CLIC-Note-615."""
ref_line = u"""[91] S. Calatroni, H. Neupert, and M. Taborelli, “Fatigue testing of materials by UV pulsed laser irradiation,” CLIC Note 615, CERN, 2004."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">91</subfield>
<subfield code="h">S. Calatroni, H. Neupert, and M. Taborelli</subfield>
<subfield code="t">Fatigue testing of materials by UV pulsed laser irradiation</subfield>
<subfield code="r">CERN-CLIC-Note-615</subfield>
</datafield>
</record>""")
def test_note_format2(self):
"""'CERN CLIC-Note-764' is expected as report number CERN-CLIC-Note-764.

The input also carries a long author list, expected unchanged in $h.
"""
ref_line = u"""[5] H. Braun, R. Corsini, J. P. Delahaye, A. de Roeck, S. Dbert, A. Ferrari, G. Geschonke, A. Grudiev, C. Hauviller, B. Jeanneret, E. Jensen, T. Lefvre, Y. Papaphilippou, G. Riddone, L. Rinolfi, W. D. Schlatter, H. Schmickler, D. Schulte, I. Syratchev, M. Taborelli, F. Tecker, R. Toms, S. Weisz, and W. Wuensch, “CLIC 2008 parameters,” tech. rep., CERN CLIC-Note-764, Oct 2008."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">5</subfield>
<subfield code="h">H. Braun, R. Corsini, J. P. Delahaye, A. de Roeck, S. Dbert, A. Ferrari, G. Geschonke, A. Grudiev, C. Hauviller, B. Jeanneret, E. Jensen, T. Lefvre, Y. Papaphilippou, G. Riddone, L. Rinolfi, W. D. Schlatter, H. Schmickler, D. Schulte, I. Syratchev, M. Taborelli, F. Tecker, R. Toms, S. Weisz, and W. Wuensch</subfield>
<subfield code="t">CLIC 2008 parameters</subfield>
<subfield code="r">CERN-CLIC-Note-764</subfield>
</datafield>
</record>""")
def test_remove_empty_misc_tag(self):
"""A reference that is only a quoted URL: expect just $o and $u.

ignore_misc=False is passed; presumably this makes $m subfields part of
the comparison, so no (empty) $m may be produced -- confirm in
_reference_test.
"""
ref_line = u"""[21] “http://www.linearcollider.org/.”"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">21</subfield>
<subfield code="u">http://www.linearcollider.org/</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_sub_volume_not_recognized(self):
"""'vol. 304, no. 5669, pp. 422-426, 2004' after the journal name.

Expected: $s Science,304,422 and $y 2004; the issue number is not part of
the expected journal subfield.
"""
ref_line = u"""[37] L. Lu, Y. Shen, X. Chen, L. Qian, and K. Lu, “Ultrahigh strength and high electrical conductivity in copper,” Science, vol. 304, no. 5669, pp. 422–426, 2004."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">37</subfield>
<subfield code="h">L. Lu, Y. Shen, X. Chen, L. Qian, and K. Lu</subfield>
<subfield code="t">Ultrahigh strength and high electrical conductivity in copper</subfield>
<subfield code="s">Science,304,422</subfield>
<subfield code="y">2004</subfield>
</datafield>
</record>""")
def test_extra_a_after_journal(self):
"""Collaboration, authors and 'J. Phys. G37 (2010) 075021'.

Expected: $c Particle Data Group Collaboration, $h K. Nakamura et al.,
$s J.Phys.,G37,075021 and $y 2010.
"""
ref_line = u"""[28] Particle Data Group Collaboration, K. Nakamura et al., Review of particle physics, J. Phys. G37 (2010) 075021."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">28</subfield>
<subfield code="c">Particle Data Group Collaboration</subfield>
<subfield code="h">K. Nakamura et al.</subfield>
<subfield code="s">J.Phys.,G37,075021</subfield>
<subfield code="y">2010</subfield>
</datafield>
</record>""")
def test_full_month_with_volume(self):
"""Year given together with a full month name, '(July, 1985)'.

Expected: $s Rev.Mod.Phys.,57,699 and $y 1985.
"""
ref_line = u"""[2] C. Rubbia, Experimental observation of the intermediate vector bosons W+, W−, and Z0, Reviews of Modern Physics 57 (July, 1985) 699–722."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">2</subfield>
<subfield code="h">C. Rubbia</subfield>
<subfield code="s">Rev.Mod.Phys.,57,699</subfield>
<subfield code="y">1985</subfield>
</datafield>
</record>""")
def test_wrong_replacement(self):
"""Author initials must not be replaced by a journal abbreviation

Guards against the wrong replacement of 'A. J. Hey, Gauge' by
'Astron.J. Hey'. Expected: both authors in $h, publisher 'CRC Pr.' in $p,
and no journal subfield.
"""
ref_line = u"""[5] I. J. Aitchison and A. J. Hey, Gauge Theories in Particle Physics, Vol II: QCD and the Electroweak Theory. CRC Pr I Llc, 2003."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">5</subfield>
<subfield code="h">I. J. Aitchison and A. J. Hey</subfield>
<subfield code="p">CRC Pr.</subfield>
</datafield>
</record>""")
def test_author_replacement(self):
"""Long author list containing a detached combining diaeresis.

Expected: the name appears in $h with the precomposed u-umlaut, the rest
of the author list is unchanged, and $r is ATL-PHYS-INT-2009-110.
"""
ref_line = u"""[48] D. Adams, S. Asai, D. Cavalli, M. Du ̈hrssen, K. Edmonds, S. Elles, M. Fehling, U. Felzmann, L. Gladilin, L. Helary, M. Hohlfeld, S. Horvat, K. Jakobs, M. Kaneda, G. Kirsch, S. Kuehn, J. F. Marchand, C. Pizio, X. Portell, D. Rebuzzi, E. Schmidt, A. Shibata, I. Vivarelli, S. Winkelmann, and S. Yamamoto, The ATLFAST-II performance in release 14 -particle signatures and selected benchmark processes-, Tech. Rep. ATL-PHYS-INT-2009-110, CERN, Geneva, Dec, 2009."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">48</subfield>
<subfield code="h">D. Adams, S. Asai, D. Cavalli, M. D\xfchrssen, K. Edmonds, S. Elles, M. Fehling, U. Felzmann, L. Gladilin, L. Helary, M. Hohlfeld, S. Horvat, K. Jakobs, M. Kaneda, G. Kirsch, S. Kuehn, J. F. Marchand, C. Pizio, X. Portell, D. Rebuzzi, E. Schmidt, A. Shibata, I. Vivarelli, S. Winkelmann, and S. Yamamoto</subfield>
<subfield code="r">ATL-PHYS-INT-2009-110</subfield>
</datafield>
</record>""")
def test_author_not_recognized1(self):
"""Authors written 'Pod I., C. Jennings, et al, etc.'.

Expected: $h 'Pod I., C. Jennings, et al.' (without 'etc.'),
$s Nucl.Phys.,B342,15 and $y 1990.
"""
ref_line = u"""[7] Pod I., C. Jennings, et al, etc., Nucl. Phys. B342, 15 (1990)"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">7</subfield>
<subfield code="h">Pod I., C. Jennings, et al.</subfield>
<subfield code="s">Nucl.Phys.,B342,15</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
def test_title_comma(self):
"""Comma between journal title and volume ('Nucl. Instrum. Methods, A570').

Expected: $s Nucl.Instrum.Meth.,A570,36 and $y 2007.
"""
ref_line = u"""[24] R. Downing et al., Nucl. Instrum. Methods, A570, 36 (2007)."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">24</subfield>
<subfield code="h">R. Downing et al.</subfield>
<subfield code="s">Nucl.Instrum.Meth.,A570,36</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>""")
def test_author1(self):
"""Four ';'-separated citations under one marker, with a trailing ';'.

Expected: four datafields, all with $o 43; the first three carry their own
authors in $h, the last one has a journal but no $h.
"""
ref_line = u"""[43] L.S. Durkin and P. Langacker, Phys. Lett B166, 436 (1986); Amaldi et al., Phys. Rev. D36, 1385 (1987); Hayward and Yellow et al., Phys. Lett B245, 669 (1990); Nucl. Phys. B342, 15 (1990);"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">43</subfield>
<subfield code="h">L.S. Durkin and P. Langacker</subfield>
<subfield code="s">Phys.Lett.,B166,436</subfield>
<subfield code="y">1986</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">43</subfield>
<subfield code="h">Amaldi et al.</subfield>
<subfield code="s">Phys.Rev.,D36,1385</subfield>
<subfield code="y">1987</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">43</subfield>
<subfield code="h">Hayward and Yellow et al.</subfield>
<subfield code="s">Phys.Lett.,B245,669</subfield>
<subfield code="y">1990</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">43</subfield>
<subfield code="s">Nucl.Phys.,B342,15</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
def test_author2(self):
"""Five ';'-separated citations, one of them a proceedings entry.

Expected: five datafields with $o 15. The proceedings entry is expected
with the editor rendered as '(E. Berger (eds.))' in $h and publisher
'World Scientific' in $p; the first entry has no authors.
"""
ref_line = u"""[15] Nucl. Phys., B372, 3 (1992); T.G. Rizzo, Phys. Rev. D40, 3035 (1989); Proceedings of the 1990 Summer Study on High Energy Physics. ed E. Berger, June 25-July 13, 1990, Snowmass Colorado (World Scientific, Singapore, 1992) p. 233; V. Barger, J.L. Hewett and T.G. Rizzo, Phys. Rev. D42, 152 (1990); J.L. Hewett, Phys. Lett. B238, 98 (1990)"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="s">Nucl.Phys.,B372,3</subfield>
<subfield code="y">1992</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">T.G. Rizzo</subfield>
<subfield code="s">Phys.Rev.,D40,3035</subfield>
<subfield code="y">1989</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">(E. Berger (eds.))</subfield>
<subfield code="p">World Scientific</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">V. Barger, J.L. Hewett and T.G. Rizzo</subfield>
<subfield code="s">Phys.Rev.,D42,152</subfield>
<subfield code="y">1990</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">J.L. Hewett</subfield>
<subfield code="s">Phys.Lett.,B238,98</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
def test_merging(self):
"""Test how references are merged together

We may choose to merge invalid references to the previous one; the
fixture below expects no merging: the seven ';'-separated fragments give
seven datafields, with the misc-only, editor-only and author-only
fragments each kept in a datafield of their own.
ignore_misc=False is passed to _reference_test (the $m subfield is listed
in the expected record).
"""
ref_line = u"""[15] Nucl. Phys., B372, 3 (1992); T.G. Rizzo, Phys. Rev. D40, 3035 (1989); Proceedings of the 1990 Summer Study on High Energy Physics; ed E. Berger; V. Barger, J.L. Hewett and T.G. Rizzo ; Phys. Rev. D42, 152 (1990); J.L. Hewett, Phys. Lett. B238, 98 (1990)"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="s">Nucl.Phys.,B372,3</subfield>
<subfield code="y">1992</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">T.G. Rizzo</subfield>
<subfield code="s">Phys.Rev.,D40,3035</subfield>
<subfield code="y">1989</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="m">Proceedings of the 1990 Summer Study on High Energy Physics</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">(E. Berger (eds.))</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">V. Barger, J.L. Hewett and T.G. Rizzo</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="s">Phys.Rev.,D42,152</subfield>
<subfield code="y">1990</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">J.L. Hewett</subfield>
<subfield code="s">Phys.Lett.,B238,98</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_merging2(self):
"""Free text after ';' following a journal citation.

Expected: two datafields, the second holding only $m 'hello world'.
ignore_misc=False is passed to _reference_test.
"""
ref_line = u"""[15] Nucl. Phys., B372, 3 (1992); hello world"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="s">Nucl.Phys.,B372,3</subfield>
<subfield code="y">1992</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="m">hello world</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_merging3(self):
"""Free text with an embedded author after ';'.

Expected: a second datafield with $m 'hello world foo' (the text around
the author, joined) and $h 'T.G. Rizzo'.
ignore_misc=False is passed to _reference_test.
"""
ref_line = u"""[15] Nucl. Phys., B372, 3 (1992); hello world T.G. Rizzo foo"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="s">Nucl.Phys.,B372,3</subfield>
<subfield code="y">1992</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="m">hello world foo</subfield>
<subfield code="h">T.G. Rizzo</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_merging4(self):
"""Author-only fragment before ';', journal citation after it.

Expected: two datafields, the author alone in the first and the journal
(Nucl.Phys.,B372,3, 1992) in the second.
ignore_misc=False is passed to _reference_test.
"""
ref_line = u"""[15] T.G. Rizzo; Nucl. Phys., B372, 3 (1992)"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="h">T.G. Rizzo</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">15</subfield>
<subfield code="s">Nucl.Phys.,B372,3</subfield>
<subfield code="y">1992</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_merging5(self):
"""Reference wrapped over four lines with three ';'-separated citations.

Expected: three datafields with $o 39. Only the first citation is expected
as a journal ($s); the other two journal strings are expected in $m.
NOTE(review): the expected values join the wrapped input lines without a
space ('(2010)012069', 'M. Pavanand C. Tomei') -- the fixture pins that
output as-is; confirm whether this is intended.
ignore_misc=False is passed to _reference_test.
"""
ref_line = u"""[39] C. Arnaboldi et al., Nucl. Instrum. Meth. A 518 (2004) 775
[hep-ex/0212053]; M. Sisti [CUORE Collaboration], J. Phys. Conf. Ser. 203 (2010)
012069; F. Bellini, C. Bucci, S. Capelli, O. Cremonesi, L. Gironi, M. Martinez, M. Pavan
and C. Tomei et al., Astropart. Phys. 33 (2010) 169 [arXiv:0912.0452 [physics.ins-det]]."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">C. Arnaboldi et al.</subfield>
<subfield code="s">Nucl.Instrum.Meth.,A518,775</subfield>
<subfield code="r">hep-ex/0212053</subfield>
<subfield code="y">2004</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">M. Sisti</subfield>
<subfield code="c">CUORE Collaboration</subfield>
<subfield code="m">J. Phys. Conf. Ser. 203 (2010)012069</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">F. Bellini, C. Bucci, S. Capelli, O. Cremonesi, L. Gironi, M. Martinez, M. Pavanand C. Tomei et al.</subfield>
<subfield code="m">Astropart. Phys. 33 (2010) 169</subfield>
<subfield code="r">arXiv:0912.0452 [physics.ins-det]</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_extra_blank_reference(self):
"""Trailing ';' after the last citation must not add a blank datafield.

Expected: exactly two datafields. In each, '[ArXiv:NNNN.NNNN][hep-th]'
is expected as $r 'arXiv:NNNN.NNNN' plus $m '[hep-th]'.
ignore_misc=False is passed to _reference_test.
"""
ref_line = u"""[26] U. Gursoy and E. Kiritsis, “Exploring improved holographic theories for QCD: Part I,” JHEP 0802 (2008) 032 [ArXiv:0707.1324][hep-th]; U. Gursoy, E. Kiritsis and F. Nitti, “Exploring improved holographic theories for QCD: Part II,” JHEP 0802 (2008) 019 [ArXiv:0707.1349][hep-th];"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">26</subfield>
<subfield code="h">U. Gursoy and E. Kiritsis</subfield>
<subfield code="t">Exploring improved holographic theories for QCD: Part I</subfield>
<subfield code="s">JHEP,0802,032</subfield>
<subfield code="r">arXiv:0707.1324</subfield>
<subfield code="m">[hep-th]</subfield>
<subfield code="y">2008</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">26</subfield>
<subfield code="h">U. Gursoy, E. Kiritsis and F. Nitti</subfield>
<subfield code="t">Exploring improved holographic theories for QCD: Part II</subfield>
<subfield code="s">JHEP,0802,019</subfield>
<subfield code="r">arXiv:0707.1349</subfield>
<subfield code="m">[hep-th]</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_invalid_author(self):
"""Title words must not be detected as an author

'at Finite T' inside the quoted title used to be detected as an (invalid)
author. Expected: only 'A. Taliotis' in $h, the whole quoted title in $t,
$s Phys.Rev.,C83,045204, $r arXiv:1011.6618 and $y 2011.
"""
ref_line = u"""[23] A. Taliotis, “qq ̄ Potential at Finite T and Weak Coupling in N = 4,” Phys. Rev. C83, 045204 (2011). [ArXiv:1011.6618][hep-th]."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">23</subfield>
<subfield code="h">A. Taliotis</subfield>
<subfield code="t">qq \u0304 Potential at Finite T and Weak Coupling in N = 4</subfield>
<subfield code="s">Phys.Rev.,C83,045204</subfield>
<subfield code="r">arXiv:1011.6618</subfield>
<subfield code="y">2011</subfield>
</datafield>
</record>""")
def test_split_arxiv(self):
"""The arXiv number must stay with the citation it belongs to

The arXiv report number used to be split off from its reference.
Expected: one datafield holding both $s Nucl.Phys.,A830,299C and
$r arXiv:0907.4204 (page '299C-302C' is expected as '299C').
"""
ref_line = u"""[18] A. Taliotis, “DIS from the AdS/CFT correspondence,” Nucl. Phys. A830, 299C-302C (2009). [ArXiv:0907.4204][hep-th]."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">18</subfield>
<subfield code="h">A. Taliotis</subfield>
<subfield code="t">DIS from the AdS/CFT correspondence</subfield>
<subfield code="s">Nucl.Phys.,A830,299C</subfield>
<subfield code="r">arXiv:0907.4204</subfield>
<subfield code="y">2009</subfield>
</datafield>
</record>""")
def test_report_without_dash(self):
"""'CERN-LHCC2005-022' (dash missing) is expected as CERN-LHCC-2005-022."""
ref_line = u"""[20] G. Duckeck et al., “ATLAS computing: Technical design report,” CERN-LHCC2005-022."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">20</subfield>
<subfield code="h">G. Duckeck et al.</subfield>
<subfield code="t">ATLAS computing: Technical design report</subfield>
<subfield code="r">CERN-LHCC-2005-022</subfield>
</datafield>
</record>""")
def test_report_with_slashes(self):
"""'CERN/LHCC/2005-022' (slashes) is expected as CERN-LHCC-2005-022."""
ref_line = u"""[20] G. Duckeck et al., “ATLAS computing: Technical design report,” CERN/LHCC/2005-022."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">20</subfield>
<subfield code="h">G. Duckeck et al.</subfield>
<subfield code="t">ATLAS computing: Technical design report</subfield>
<subfield code="r">CERN-LHCC-2005-022</subfield>
</datafield>
</record>""")
def test_ed_before_et_al(self):
"""'(ed. )' between the author and 'et al.'.

Expected: $h 'G. Duckeck, (ed.) et al.' (the stray space inside the
parentheses is not in the expected value).
"""
ref_line = u"""[20] G. Duckeck, (ed. ) et al., “ATLAS computing: Technical design report,” CERN-LHCC-2005-022."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">20</subfield>
<subfield code="h">G. Duckeck, (ed.) et al.</subfield>
<subfield code="t">ATLAS computing: Technical design report</subfield>
<subfield code="r">CERN-LHCC-2005-022</subfield>
</datafield>
</record>""")
def test_journal_but_no_page(self):
"""Journal string without a page ('JHEP,03,1988').

Expected: no $s and no $y, only the author in $h and the title in $t.
"""
ref_line = u"""[20] G. Duckeck, “ATLAS computing: Technical design report,” JHEP,03,1988"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">20</subfield>
<subfield code="h">G. Duckeck</subfield>
<subfield code="t">ATLAS computing: Technical design report</subfield>
</datafield>
</record>""")
def test_isbn1(self):
"""Hyphenated ISBN: expect $i 0-9704670-3-6 next to the author in $h."""
ref_line = u"""[22] B. Crowell, Vibrations and Waves. www.lightandmatter.com, 2009. ISBN 0-9704670-3-6."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">22</subfield>
<subfield code="h">B. Crowell</subfield>
<subfield code="i">0-9704670-3-6</subfield>
</datafield>
</record>""")
def test_isbn2(self):
"""13-digit ISBN without hyphens plus a publisher.

Expected: $i 9780070014855 and the publisher 'Mcgraw-Hill' written as
'McGraw-Hill' in $p.
"""
ref_line = u"""[119] D. E. Gray, American Institute of Physics Handbook. Mcgraw-Hill, 3rd ed., 1972. ISBN 9780070014855."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">119</subfield>
<subfield code="h">D. E. Gray</subfield>
<subfield code="p">McGraw-Hill</subfield>
<subfield code="i">9780070014855</subfield>
</datafield>
</record>""")
def test_book(self):
"""Book citation: expect publisher Wiley-VCH, the quoted title and year 2008."""
ref_line = u"""[1] D. Griffiths, “Introduction to elementary particles,” Weinheim, USA: Wiley-VCH (2008) 454 p."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">D. Griffiths</subfield>
<subfield code="p">Wiley-VCH</subfield>
<subfield code="t">Introduction to elementary particles</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_complex_arxiv(self):
"""arXiv id with version and category ('arXiv:1012.3675v1 [physics.ins-det]').

Expected: $r 'arXiv:1012.3675 [physics.ins-det]', i.e. without the 'v1'.
"""
ref_line = u"""[4] J.Prat, arXiv:1012.3675v1 [physics.ins-det]"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="h">J.Prat</subfield>
<subfield code="r">arXiv:1012.3675 [physics.ins-det]</subfield>
</datafield>
</record>""")
def test_new_arxiv(self):
"""Bare bracketed id '[0802.2879]' without an 'arXiv:' prefix.

Expected: $r arXiv:0802.2879, together with $s JHEP,0804,034 and $y 2008.
"""
ref_line = u"""[178] D. R. Tovey, On measuring the masses of pair-produced semi-invisibly decaying particles at hadron colliders, JHEP 04 (2008) 034, [0802.2879]."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">178</subfield>
<subfield code="h">D. R. Tovey</subfield>
<subfield code="s">JHEP,0804,034</subfield>
<subfield code="r">arXiv:0802.2879</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_new_arxiv2(self):
"""Bare bracketed id '[9112.2879]': expected as $r arXiv:9112.2879."""
ref_line = u"""[178] D. R. Tovey, On measuring the masses of pair-produced semi-invisibly decaying particles at hadron colliders, JHEP 04 (2008) 034, [9112.2879]."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">178</subfield>
<subfield code="h">D. R. Tovey</subfield>
<subfield code="s">JHEP,0804,034</subfield>
<subfield code="r">arXiv:9112.2879</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_new_arxiv3(self):
"""Bare bracketed id '[1212.2879]': expected as $r arXiv:1212.2879."""
ref_line = u"""[178] D. R. Tovey, On measuring the masses of pair-produced semi-invisibly decaying particles at hadron colliders, JHEP 04 (2008) 034, [1212.2879]."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">178</subfield>
<subfield code="h">D. R. Tovey</subfield>
<subfield code="s">JHEP,0804,034</subfield>
<subfield code="r">arXiv:1212.2879</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_new_arxiv_invalid(self):
"""Bare bracketed number '[9002.2879]' must not become a report number.

Expected: no $r, only the journal and year.
NOTE(review): presumably rejected because '9002' is not an accepted
year/month prefix -- confirm against the arXiv regexps.
"""
ref_line = u"""[178] D. R. Tovey, On measuring the masses of pair-produced semi-invisibly decaying particles at hadron colliders, JHEP 04 (2008) 034, [9002.2879]."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">178</subfield>
<subfield code="h">D. R. Tovey</subfield>
<subfield code="s">JHEP,0804,034</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_new_arxiv_invalid2(self):
"""Bare bracketed number '[9113.2879]' must not become a report number.

Expected: no $r, only the journal and year.
NOTE(review): presumably rejected because '13' is not a valid month --
confirm against the arXiv regexps.
"""
ref_line = u"""[178] D. R. Tovey, On measuring the masses of pair-produced semi-invisibly decaying particles at hadron colliders, JHEP 04 (2008) 034, [9113.2879]."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">178</subfield>
<subfield code="h">D. R. Tovey</subfield>
<subfield code="s">JHEP,0804,034</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_special_journals(self):
"""'JHEP 04 (2008) 034': expect the volume written as '0804'.

Expected: $s JHEP,0804,034 (two-digit year prepended to the volume) and
$y 2008.
"""
ref_line = u"""[178] D. R. Tovey, JHEP 04 (2008) 034"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">178</subfield>
<subfield code="h">D. R. Tovey</subfield>
<subfield code="s">JHEP,0804,034</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_unrecognized_author(self):
"""Author list with a hyphenated initial and a name ending in an apostrophe.

Expected: all three authors kept in $h, the ASCII-quoted title in $t,
$s Nucl.Phys.,B701,334, $r hep-th/0403133 and $y 2004.
"""
ref_line = u"""[27] B. Feng, Y. -H. He, P. Fre', "On correspondences between toric singularities and (p,q) webs," Nucl. Phys. B701 (2004) 334-356. [hep-th/0403133]"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">27</subfield>
<subfield code="h">B. Feng, Y. -H. He, P. Fre'</subfield>
<subfield code="t">On correspondences between toric singularities and (p,q) webs</subfield>
<subfield code="s">Nucl.Phys.,B701,334</subfield>
<subfield code="r">hep-th/0403133</subfield>
<subfield code="y">2004</subfield>
</datafield>
</record>""")
def test_unrecognized_author2(self):
"""Surnames containing an apostrophe, typographic and ASCII variants.

Expected: both spellings kept unchanged in $h, and
'[arXiv:hep-th/9808014]' expected as $r hep-th/9808014.
"""
ref_line = u"""[75] J. M. Figueroa-O’Farrill, J. M. Figueroa-O'Farrill, C. M. Hull and B. J. Spence, "Branes at conical singularities and holography," Adv. Theor. Math. Phys. 2, 1249 (1999) [arXiv:hep-th/9808014]"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">75</subfield>
<subfield code="h">J. M. Figueroa-O’Farrill, J. M. Figueroa-O'Farrill, C. M. Hull and B. J. Spence</subfield>
<subfield code="t">Branes at conical singularities and holography</subfield>
<subfield code="s">Adv.Theor.Math.Phys.,2,1249</subfield>
<subfield code="r">hep-th/9808014</subfield>
<subfield code="y">1999</subfield>
</datafield>
</record>""")
def test_pos(self):
"""'PoS LAT2007 (2007) 369': expect $s PoS,LAT2007,369 and $y 2007."""
ref_line = u"""[23] M. A. Donnellan, et al., PoS LAT2007 (2007) 369."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">23</subfield>
<subfield code="h">M. A. Donnellan, et al.</subfield>
<subfield code="s">PoS,LAT2007,369</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>""")
def test_pos2(self):
"""'PoS LAT2007 2007 369' (year not in parentheses).

Expected: $s PoS,LAT2007,369 and $y 2007.
"""
ref_line = u"""[23] M. A. Donnellan, et al., PoS LAT2007 2007 369."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">23</subfield>
<subfield code="h">M. A. Donnellan, et al.</subfield>
<subfield code="s">PoS,LAT2007,369</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>""")
def test_pos3(self):
"""Compact form 'PoS(LAT2005)239' with no separate year.

Expected: $s PoS,LAT2005,239 and $y 2005 (the year only appears inside
the conference code in the input).
"""
ref_line = u"""[23] M. A. Donnellan, et al., PoS(LAT2005)239."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">23</subfield>
<subfield code="h">M. A. Donnellan, et al.</subfield>
<subfield code="s">PoS,LAT2005,239</subfield>
<subfield code="y">2005</subfield>
</datafield>
</record>""")
def test_pos4(self):
"""'PoS CHARGED 2010, 030 (2010)' with a space inside the conference code.

Expected: $s PoS,CHARGED2010,030 and $y 2010.
"""
ref_line = u"""[23] PoS CHARGED 2010, 030 (2010)"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">23</subfield>
<subfield code="s">PoS,CHARGED2010,030</subfield>
<subfield code="y">2010</subfield>
</datafield>
</record>""")
def test_complex_author(self):
"""Authors written with full first name and middle initial.

Expected: the three 'Michael E. Peskin' entries together in $h and the
quoted title in $t.
"""
ref_line = u"""[39] Michael E. Peskin, Michael E. Peskin and Michael E. Peskin “An Introduction To Quantum Field Theory,” Westview Press, 1995."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">Michael E. Peskin, Michael E. Peskin and Michael E. Peskin</subfield>
<subfield code="t">An Introduction To Quantum Field Theory</subfield>
</datafield>
</record>""")
def test_complex_author2(self):
"""Full-name authors whose first name is 'Dan'.

Expected: the three 'Dan V. Schroeder' entries together in $h, the quoted
title in $t, and no journal subfield.
"""
ref_line = u"""[39] Dan V. Schroeder, Dan V. Schroeder and Dan V. Schroeder “An Introduction To Quantum Field Theory,” Westview Press, 1995."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">Dan V. Schroeder, Dan V. Schroeder and Dan V. Schroeder</subfield>
<subfield code="t">An Introduction To Quantum Field Theory</subfield>
</datafield>
</record>""")
def test_dan_journal(self):
"""First name 'Dan' in an author list must not produce a journal.

Expected: both authors in $h, the quoted title in $t, and no $s.
"""
ref_line = u"""[39] Michael E. Peskin and Dan V. Schroeder “An Introduction To Quantum Field Theory,” Westview Press, 1995."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">Michael E. Peskin and Dan V. Schroeder</subfield>
<subfield code="t">An Introduction To Quantum Field Theory</subfield>
</datafield>
</record>""")
def test_dan_journal2(self):
"""Author named 'Dan' followed by the journal abbreviation 'DAN'.

Expected: $h 'Dan V. Schroeder' and, for 'DAN B701 (2004) 334-356',
$s 'Dokl.Akad.Nauk Ser.Fiz.,B701,334' with $y 2004.
"""
ref_line = u"""[39] Dan V. Schroeder DAN B701 (2004) 334-356"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">39</subfield>
<subfield code="h">Dan V. Schroeder</subfield>
<subfield code="s">Dokl.Akad.Nauk Ser.Fiz.,B701,334</subfield>
<subfield code="y">2004</subfield>
</datafield>
</record>""")
def test_query_in_url(self):
"""URL with a query string.

Expected: the whole URL, query included, in $u with each '&' written as
the XML entity '&amp;'; the collaboration goes to $c.
"""
ref_line = u"""[69] ATLAS Collaboration. Mutag. http://indico.cern.ch/getFile.py/access?contribId=9&resId=1&materialId=slides&confId=35502"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">69</subfield>
<subfield code="c">ATLAS Collaboration</subfield>
<subfield code="u">http://indico.cern.ch/getFile.py/access?contribId=9&amp;resId=1&amp;materialId=slides&amp;confId=35502</subfield>
</datafield>
</record>""")
def test_volume_colon_page(self):
"""Compact numeration 'Z.Phys.C72:637-646,1996' (volume:pages,year).

Expected: $s Z.Phys.,C72,637 and $y 1996.
"""
ref_line = u"""[77] J. M. Butterworth et al. Multiparton interactions in photoproduction at hera. Z.Phys.C72:637-646,1996."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">77</subfield>
<subfield code="h">J. M. Butterworth et al.</subfield>
<subfield code="s">Z.Phys.,C72,637</subfield>
<subfield code="y">1996</subfield>
</datafield>
</record>""")
def test_no_spaces_numeration(self):
"""Numeration without spaces, '465(2001)131-134'.

Expected: $s Z.Phys.,465,131 and $y 2001; 'et al' in the input is expected
as 'et al.' in $h.
"""
ref_line = u"""[1] I.M. Gregor et al, Optical links for the ATLAS SCT and Pixel detector, Z.Phys. 465(2001)131-134"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">I.M. Gregor et al.</subfield>
<subfield code="s">Z.Phys.,465,131</subfield>
<subfield code="y">2001</subfield>
</datafield>
</record>""")
def test_dot_after_year(self):
"""Year first, followed by a dot: 'Phys.Rev. 2006. 56:569-628'.

Expected: $s Phys.Rev.,56,569 and $y 2006.
"""
ref_line = u"""[1] Neutrino Mass and New Physics, Phys.Rev. 2006. 56:569-628"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="s">Phys.Rev.,56,569</subfield>
<subfield code="y">2006</subfield>
</datafield>
</record>""")
def test_journal_roman(self):
"""Roman-numeral volume 'VI'.

Expected: the volume as an arabic number, $s Commun.Math.Phys.,6,529,
with $y 1990.
"""
ref_line = u"""[19] D. Page and C. Pope, Commun. Math. Phys. VI (1990) 529."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">19</subfield>
<subfield code="h">D. Page and C. Pope</subfield>
<subfield code="s">Commun.Math.Phys.,6,529</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
def test_journal_phys_rev_d(self):
"""Series letter glued to the title and followed by a comma ('Phys. Rev.D, 41').

Expected: $s Phys.Rev.,D41,83 and $y 1990, with the surname-first author
'Sivers D. W.' in $h.
"""
ref_line = u"""[6] Sivers D. W., Phys. Rev.D, 41 (1990) 83"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="s">Phys.Rev.,D41,83</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
def test_publisher(self):
"""Publisher written in mixed case ('BrAnS'): expect $p 'Brans'."""
ref_line = u"""[6] Sivers D. W., BrAnS Hello"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="p">Brans</subfield>
</datafield>
</record>""")
def test_hep_formatting(self):
"""'hep-ph-9711200' (dash before the number) is expected as hep-ph/9711200."""
ref_line = u"""[6] Sivers D. W., hep-ph-9711200"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="r">hep-ph/9711200</subfield>
</datafield>
</record>""")
def test_hep_formatting2(self):
"""'astro-ph-9711200' (dash before the number) is expected as astro-ph/9711200."""
ref_line = u"""[6] Sivers D. W., astro-ph-9711200"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="r">astro-ph/9711200</subfield>
</datafield>
</record>""")
def test_nucl_phys_b_removal(self):
"""'Nucl. Phys. (Proc.Suppl.) B21': the 'B' is not kept in the volume.

Expected: $s Nucl.Phys.Proc.Suppl.,21,334 and $y 2004.
"""
ref_line = u"""[6] Sivers D. W., Nucl. Phys. (Proc.Suppl.) B21 (2004) 334-356"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="s">Nucl.Phys.Proc.Suppl.,21,334</subfield>
<subfield code="y">2004</subfield>
</datafield>
</record>""")
def test_citations_splitting(self):
"""Two comma-separated 'author, report number' pairs in one reference.

Expected: two datafields, each with its own $h and $r, and with $0 set to
'1' and '2' respectively.
"""
ref_line = u"""[6] Sivers D. W., CERN-EX-0106015, D. Page, CERN-EX-0104007"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="r">CERN-EX-0106015</subfield>
<subfield code="0">1</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">D. Page</subfield>
<subfield code="r">CERN-EX-0104007</subfield>
<subfield code="0">2</subfield>
</datafield>
</record>""")
def test_citations_splitting2(self):
"""Two report numbers that are expected to stay in ONE datafield.

Unlike test_citations_splitting, the expected record has a single
datafield with both $r values, only 'Sivers D. W.' in $h (no 'D. Page')
and $0 '10'.
NOTE(review): presumably both report numbers resolve to the same record,
which would explain the single datafield -- confirm with the linking code.
"""
ref_line = u"""[6] Sivers D. W., hep-ex/0201013, D. Page, CERN-EP-2001-094"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="r">hep-ex/0201013</subfield>
<subfield code="r">CERN-EP-2001-094</subfield>
<subfield code="0">10</subfield>
</datafield>
</record>""")
def test_arxiv_report_number(self):
"""Should be recognized by arxiv regexps list
(not in report-numbers.kb)

Input 'math.AA/0101888' is expected unchanged in $r.
"""
ref_line = u"""[6] Sivers D. W., math.AA/0101888"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="r">math.AA/0101888</subfield>
</datafield>
</record>""")
def test_arxiv_report_number2(self):
"""':' instead of '/' in an arXiv report number

'[arXiv: hep-ph:0203079]' is expected as $r hep-ph/0203079.
NOTE(review): the expected record lists no $s for the 'Phys. Rept. 381'
citation or its erratum -- confirm this is intended.
"""
ref_line = u"""[12] C. T. Hill and E. H. Simmons, Phys. Rept. 381: 235-402 (2003), Erratum-ibid. 390: 553-554 (2004) [arXiv: hep-ph:0203079]."""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="h">C. T. Hill and E. H. Simmons</subfield>
<subfield code="r">hep-ph/0203079</subfield>
</datafield>
</record>""")
def test_arxiv_report_number3(self):
"""Version suffix after an old-style arXiv report number

'hep-ph/0203079v1' is expected as $r hep-ph/0203079, without the 'v1'.
"""
ref_line = u"""[12] hep-ph/0203079v1"""
_reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="r">hep-ph/0203079</subfield>
</datafield>
</record>""")
def test_arxiv_report_number4(self):
    """An arXiv-like number with trailing letters is not tagged as a report.

    The whole token is expected back as misc text, so misc subfields are
    compared too (``ignore_misc=False``).
    """
    ref_line = u"""[12] hep-ph/0203079invalid"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="m">hep-ph/0203079invalid</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_arxiv_not_parsed(self):
    """``arXiv: 0701034 [hep-ph]`` is expected as ``hep-ph/0701034``."""
    ref_line = u"""[12] arXiv: 0701034 [hep-ph]"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="r">hep-ph/0701034</subfield>
</datafield>
</record>""")
def test_arxiv_report_number_replacement(self):
    """A misspelt category is replaced by a valid arXiv report number.

    ``astro-phy/0101888`` in the input is expected as ``astro-ph/0101888``.
    """
    ref_line = u"""[6] Sivers D. W., astro-phy/0101888"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Sivers D. W.</subfield>
<subfield code="r">astro-ph/0101888</subfield>
</datafield>
</record>""")
def test_only_report_number(self):
    """A line holding only a report number yields one report-number subfield."""
    ref_line = u"""[6] ATL-PHYS-INT-2009-110"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="r">ATL-PHYS-INT-2009-110</subfield>
</datafield>
</record>""")
def test_only_journal(self):
    """A line holding only a journal citation yields journal and year."""
    ref_line = u"""[6] Phys. Rev.D, 41 (1990) 83"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="s">Phys.Rev.,D41,83</subfield>
<subfield code="y">1990</subfield>
</datafield>
</record>""")
def test_only_doi(self):
    """A line holding only a DOI yields the DOI without its ``doi:`` prefix."""
    ref_line = u"""[6] doi:10.1007/s10440-008-9280-9"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="a">10.1007/s10440-008-9280-9</subfield>
</datafield>
</record>""")
def test_reference_size_limit_check_valid_in_one_line(self):
    """Baseline for the reference size limit: a short reference is kept.

    Reference [1] spans three lines and its report number is expected in
    the output; test_reference_size_limit_but_removed_as_invalid uses the
    same reference stretched over many more lines.
    """
    from invenio.refextract_api import extract_references_from_string
    ref_line = u"""[1] D. Adams, S. Asai, D. Cavalli, K. Edmonds,
The ATLFAST-II performance in release 14,
Tech. Rep. ATL-PHYS-INT-2009-110, CERN, Geneva, Dec, 2009.
[2] D. Adams, ATL-PHYS-INT-2009-111"""
    record = extract_references_from_string(ref_line)
    compare_references(self, record, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">D. Adams, S. Asai, D. Cavalli, K. Edmonds</subfield>
<subfield code="r">ATL-PHYS-INT-2009-110</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">2</subfield>
<subfield code="h">D. Adams</subfield>
<subfield code="r">ATL-PHYS-INT-2009-111</subfield>
</datafield>
</record>""")
def test_reference_size_limit_but_removed_as_invalid(self):
    """Test the removal of references that are more than n lines long.

    Needs to match test_reference_size_limit_check_valid_in_one_line
    above but be on multiple lines. Here the report number of reference
    [1] is expected to be missing, while reference [2] is unaffected.
    """
    from invenio.refextract_api import extract_references_from_string
    # The "a\n" runs are escape sequences: each one adds a real line break,
    # which is what makes reference [1] overly long.
    ref_line = u"""[1] D. Adams, S. Asai, D. Cavalli, K. Edmonds,
a\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\n
a\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\na\n
The ATLFAST-II performance in release 14,
Tech. Rep. ATL-PHYS-INT-2009-110, CERN, Geneva, Dec, 2009.
[2] D. Adams, ATL-PHYS-INT-2009-111"""
    record = extract_references_from_string(ref_line)
    compare_references(self, record, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">D. Adams, S. Asai, D. Cavalli, K. Edmonds</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">2</subfield>
<subfield code="h">D. Adams</subfield>
<subfield code="r">ATL-PHYS-INT-2009-111</subfield>
</datafield>
</record>""")
def test_author_tag_inside_quoted(self):
    """Tests embedded tags in quoted text.

    We want to avoid an author tag nested inside a quoted title, i.e.:
    <cds.QUOTED>Electroweak parameters of the Z0 resonance and the Standard
    Model <cds.AUTHincl>the LEP Collaborations</cds.AUTHincl></cds.QUOTED>
    """
    ref_line = u"""[10] LEP Collaboration, G. Alexander et al., “Electroweak parameters of the Z0 resonance and the Standard Model: the LEP Collaborations,” Phys. Lett. B276 (1992) 247–253."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">10</subfield>
<subfield code="c">LEP Collaboration</subfield>
<subfield code="h">G. Alexander et al.</subfield>
<subfield code="t">Electroweak parameters of the Z0 resonance and the Standard Model: the LEP Collaborations</subfield>
<subfield code="s">Phys.Lett.,B276,247</subfield>
<subfield code="y">1992</subfield>
</datafield>
</record>""")
def test_misparsing_arxiv(self):
    """``arXiv:physics/0401042`` after an eConf citation is a report number."""
    ref_line = u"""[21] R. Barlow, Asymmetric errors, eConf C030908 (2003), arXiv:physics/0401042."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">21</subfield>
<subfield code="h">R. Barlow</subfield>
<subfield code="r">physics/0401042</subfield>
</datafield>
</record>""")
def test_no_volume(self):
    """A volume written ``205L`` is expected normalised to ``L205``."""
    ref_line = u"""[6] Owen F.N., Rudnick L., 1976, Phys. Rev., 205L, 1"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Owen F.N., Rudnick L.</subfield>
<subfield code="s">Phys.Rev.,L205,1</subfield>
<subfield code="y">1976</subfield>
</datafield>
</record>""")
def test_numeration_detached(self):
    """Numeration detection check.

    At some point was reporting two journals, detecting twice the same
    numeration. The line mentions ``Phys. Rev.`` twice but only one
    journal subfield is expected.
    """
    ref_line = u"""[6] B. Friman, in The CBM Phys. Rev. book: Compressed baryonic matter in laboratory, Phys. Rev. 814, 1 (2011)."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">B. Friman</subfield>
<subfield code="s">Phys.Rev.,814,1</subfield>
<subfield code="y">2011</subfield>
</datafield>
</record>""")
def test_no_volume2(self):
    """At some point failed to report volume correctly.

    The volume range ``A 862-863`` is expected as ``A862``.
    """
    ref_line = u"""[3] S. Sarkar, Nucl. Phys. A 862-863, 13 (2011)."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">3</subfield>
<subfield code="h">S. Sarkar</subfield>
<subfield code="s">Nucl.Phys.,A862,13</subfield>
<subfield code="y">2011</subfield>
</datafield>
</record>""")
def test_journal_title_mangled(self):
    """Makes sure this journal does not get confused with an author.

    Both the author group and ``Eur. Phys. J.`` are expected in the
    output, with the page number taken from the continuation line.
    """
    ref_line = u"""[12] K. G. Chetyrkin and A. Khodjamirian, Eur. Phys. J. C46 (2006)
721"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">12</subfield>
<subfield code="h">K. G. Chetyrkin and A. Khodjamirian</subfield>
<subfield code="s">Eur.Phys.J.,C46,721</subfield>
<subfield code="y">2006</subfield>
</datafield>
</record>""")
def test_volume_letter_goes_missing(self):
    """The letter of a volume written ``59 B`` is kept (expected ``B59``)."""
    ref_line = u"""[6] N. Cabibbo and G. Parisi, Phys. Lett. 59 B (1975) 67."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">N. Cabibbo and G. Parisi</subfield>
<subfield code="s">Phys.Lett.,B59,67</subfield>
<subfield code="y">1975</subfield>
</datafield>
</record>""")
def test_removed_dot_in_authors(self):
    """The final dot of ``Parisi G.`` is kept when a colon ends the authors."""
    ref_line = u"""[6] Cabibbo N. and Parisi G.: Phys. Lett. 59 B (1975) 67."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">6</subfield>
<subfield code="h">Cabibbo N. and Parisi G.</subfield>
<subfield code="s">Phys.Lett.,B59,67</subfield>
<subfield code="y">1975</subfield>
</datafield>
</record>""")
def test_author_with_accents(self):
    """An author name starting with an accented letter is tagged intact."""
    ref_line = u"""[1] Ôrlo A., Eur. Phys. J. C46 (2006) 721"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">Ôrlo A.</subfield>
<subfield code="s">Eur.Phys.J.,C46,721</subfield>
<subfield code="y">2006</subfield>
</datafield>
</record>""")
def test_implied_ibid(self):
    """Numeration after ';' with no title reuses the previous journal.

    ``574, 239 (2003)`` is expected as a second ``Phys.Lett.`` citation
    that inherits the series letter ``B``.
    """
    ref_line = u"""[4] S. F. King and G. G. Ross, Phys. Lett. B 520, 243 (2001); 574, 239 (2003)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="h">S. F. King and G. G. Ross</subfield>
<subfield code="s">Phys.Lett.,B520,243</subfield>
<subfield code="y">2001</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="s">Phys.Lett.,B574,239</subfield>
<subfield code="y">2003</subfield>
</datafield>
</record>""")
def test_implied_ibid2(self):
    """An implied ibid with its own series letter keeps that letter.

    ``C574, 239 (2003)`` is expected as ``Phys.Lett.,C574,239`` rather
    than inheriting ``B`` from the first citation.
    """
    ref_line = u"""[4] S. F. King and G. G. Ross, Phys. Lett. B 520, 243 (2001); C574, 239 (2003)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="h">S. F. King and G. G. Ross</subfield>
<subfield code="s">Phys.Lett.,B520,243</subfield>
<subfield code="y">2001</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="s">Phys.Lett.,C574,239</subfield>
<subfield code="y">2003</subfield>
</datafield>
</record>""")
def test_implied_ibid3(self):
    """Several implied ibids in a row each become their own citation.

    Three title-less numerations follow the first citation; all four are
    expected as ``Phys.Lett.`` with series ``B``.
    """
    ref_line = u"""[4] S. F. King and G. G. Ross, Phys. Lett. B 520, 243 (2001); 574, 239 (2003); 575, 240 (2004); 576, 241 (2005)"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="h">S. F. King and G. G. Ross</subfield>
<subfield code="s">Phys.Lett.,B520,243</subfield>
<subfield code="y">2001</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="s">Phys.Lett.,B574,239</subfield>
<subfield code="y">2003</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="s">Phys.Lett.,B575,240</subfield>
<subfield code="y">2004</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">4</subfield>
<subfield code="s">Phys.Lett.,B576,241</subfield>
<subfield code="y">2005</subfield>
</datafield>
</record>""")
def test_implied_ibid4(self):
    """An explicit ``ibid.`` followed by an implied one both resolve.

    ``ibid. 53, 437`` and the title-less ``54, 4691`` are both expected
    as ``Phys.Rev.`` with series ``D``.
    """
    ref_line = u"""[10] R. Foot, H.N. Long and T.A. Tran, Phys. Rev. D50, R34 (1994); H.N. Long, ibid. 53, 437 (1996); 54, 4691 (1996)."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">10</subfield>
<subfield code="h">R. Foot, H.N. Long and T.A. Tran</subfield>
<subfield code="s">Phys.Rev.,D50,R34</subfield>
<subfield code="y">1994</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">10</subfield>
<subfield code="h">H.N. Long</subfield>
<subfield code="s">Phys.Rev.,D53,437</subfield>
<subfield code="y">1996</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">10</subfield>
<subfield code="s">Phys.Rev.,D54,4691</subfield>
<subfield code="y">1996</subfield>
</datafield>
</record>""")
def test_report_number(self):
    """A bracketed ``physics.plasm-ph/...`` number is a report number."""
    ref_line = u"""[10] [physics.plasm-ph/0409093]."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">10</subfield>
<subfield code="r">physics.plasm-ph/0409093</subfield>
</datafield>
</record>""")
def test_journal2(self):
    """Stray ``, :`` between the title and the volume is tolerated."""
    ref_line = u"""[1] Phys.Rev. A, : 78 (2008) 012115"""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="s">Phys.Rev.,A78,012115</subfield>
<subfield code="y">2008</subfield>
</datafield>
</record>""")
def test_authors_merge(self):
    """The author group ends at ``et al.``; the rest is kept as misc text.

    Misc subfields are part of the comparison (``ignore_misc=False``).
    """
    ref_line = u"""[44] R. Baier et al., Invalid. Hello. Lett. B 345 (1995)."""
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">44</subfield>
<subfield code="h">R. Baier et al.</subfield>
<subfield code="m">Invalid. Hello. Lett. B 345 (1995)</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_atlas_conf_99(self):
    """``ATLAS-CONF`` with a two-digit year (99) is expected as ``ATL-CONF``."""
    ref_line = u'[14] ATLAS-CONF-99-078'
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="r">ATL-CONF-99-078</subfield>
</datafield>
</record>""")
def test_atlas_conf_pre_2010(self):
    """``ATL-CONF`` with a pre-2010 year (2003) is kept as written."""
    ref_line = u'[14] ATL-CONF-2003-078'
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="r">ATL-CONF-2003-078</subfield>
</datafield>
</record>""")
def test_atlas_conf_pre_2010_2(self):
    """``ATLAS-CONF`` with a pre-2010 year (2003) is expected as ``ATL-CONF``."""
    ref_line = u'[14] ATLAS-CONF-2003-078'
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="r">ATL-CONF-2003-078</subfield>
</datafield>
</record>""")
def test_atlas_conf_post_2010(self):
    """``ATLAS-CONF`` with a post-2010 year (2012) is kept as written."""
    ref_line = u'[14] ATLAS-CONF-2012-078'
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="r">ATLAS-CONF-2012-078</subfield>
</datafield>
</record>""")
def test_atlas_conf_post_2010_2(self):
    """``ATL-CONF`` with a post-2010 year (2012) is expected as ``ATLAS-CONF``."""
    ref_line = u'[14] ATL-CONF-2012-078'
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
<subfield code="r">ATLAS-CONF-2012-078</subfield>
</datafield>
</record>""")
def test_atlas_conf_post_2010_invalid(self):
    """A four-digit final part (``0784``) must not yield a report number."""
    ref_line = u'[14] ATL-CONF-2012-0784'
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">14</subfield>
</datafield>
</record>""")
def test_journal_missed(self):
    """Both ``Phys. Rev.`` citations separated by ';' are expected.

    Each citation keeps its own author group, and the trailing numbers
    (1969 and 1766) are expected as pages with 1949 as the year.
    """
    ref_line = u"[1] M. G. Mayer, Phys. Rev. 75 (1949), 1969; O. Hazel, J. H. D. Jensen, and H. E. Suess, Phys. Rev. 75 (1949), 1766."
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">M. G. Mayer</subfield>
<subfield code="s">Phys.Rev.,75,1969</subfield>
<subfield code="y">1949</subfield>
</datafield>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">1</subfield>
<subfield code="h">O. Hazel, J. H. D. Jensen, and H. E. Suess</subfield>
<subfield code="s">Phys.Rev.,75,1766</subfield>
<subfield code="y">1949</subfield>
</datafield>
</record>""")
def test_invalid_publisher(self):
    """A publisher name embedded in a word must not be tagged.

    This needs to not consider the lbl in Hoelbling as a publisher.
    """
    ref_line = u"[35] G. I. Egri, Z. Fodor, C. Hoelbling, S. D. Katz, D. Nógrádi, et. al., Lattice QCD as a video game, Comput.Phys.Commun. 177 (2007) 631–639, [hep-lat/0611022]."
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">35</subfield>
<subfield code="h">G. I. Egri, Z. Fodor, C. Hoelbling, S. D. Katz, D. N\xf3gr\xe1di, et al.</subfield>
<subfield code="r">hep-lat/0611022</subfield>
</datafield>
</record>""")
def test_valid_publisher(self):
    """A standalone ``[LBL]`` is expected to be tagged as a publisher.

    Counterpart of test_invalid_publisher, where ``lbl`` occurs inside
    a surname and must be ignored.
    """
    ref_line = u"[35] [LBL]"
    _reference_test(self, ref_line, u"""<record>
<datafield tag="999" ind1="C" ind2="5">
<subfield code="o">35</subfield>
<subfield code="p">LBL</subfield>
</datafield>
</record>""")
def test_missed_collaboration(self):
    """A collaboration given in parentheses after the authors is tagged.

    The citation sits in the middle of free text; authors, collaboration,
    journal, report number and year are all expected.
    """
    ref_line = u"""[76] these results replace the Λb → J/ψΛ and B0 → J/ψKS lifetime measurements of A. Abulencia et al. (CDF collaboration), Phys. Rev. Lett. 98, 122001 (2007), arXiv:hep-ex/0609021, as well as the B0 → J/ψK∗0"""
    _reference_test(self, ref_line, u"""<record>
<datafield ind1="C" ind2="5" tag="999">
<subfield code="o">76</subfield>
<subfield code="h">A. Abulencia et al.</subfield>
<subfield code="c">CDF collaboration</subfield>
<subfield code="s">Phys.Rev.Lett.,98,122001</subfield>
<subfield code="r">hep-ex/0609021</subfield>
<subfield code="y">2007</subfield>
</datafield>
</record>""")
def test_remove_duplicate_doi(self):
    """The same DOI given twice on a line is expected only once."""
    ref_line = u"""[1] doi:10.1007/s10440-008-9280-9 doi:10.1007/s10440-008-9280-9"""
    _reference_test(self, ref_line, u"""<record>
<datafield ind1="C" ind2="5" tag="999">
<subfield code="o">1</subfield>
<subfield code="a">10.1007/s10440-008-9280-9</subfield>
</datafield>
</record>""")
def test_leftover_tag(self):
    """Leading junk characters do not disturb author and arXiv tagging."""
    ref_line = u"""[2] ΦΦΦΦΦΦΦΦΦΦΦΦΦΦΦΦΦΦ^ E. Dudas, G. von Gersdorff, J. Parmentier and S. Pokorski, arXiv:1007.5208."""
    _reference_test(self, ref_line, u"""<record>
<datafield ind1="C" ind2="5" tag="999">
<subfield code="o">2</subfield>
<subfield code="h">E. Dudas, G. von Gersdorff, J. Parmentier and S. Pokorski</subfield>
<subfield code="r">arXiv:1007.5208</subfield>
</datafield>
</record>""")
def test_leftover_number(self):
    """No stray digit may be left as misc text next to a report number.

    The result used to be:
    <datafield ind1="C" ind2="5" tag="999">
    <subfield code="o">2</subfield>
    <subfield code="m">9</subfield>
    <subfield code="r">CERN-LHCC-2011-999</subfield>
    </datafield>
    Misc subfields are compared (``ignore_misc=False``) so that such a
    leftover makes the test fail.
    """
    ref_line = u"""[2] CERN-LHCC2011-999"""
    _reference_test(self, ref_line, u"""<record>
<datafield ind1="C" ind2="5" tag="999">
<subfield code="o">2</subfield>
<subfield code="r">CERN-LHCC-2011-999</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_leftover_brackets(self):
    """Brackets around a report number must not be left as misc text.

    Misc subfields are compared (``ignore_misc=False``).
    """
    ref_line = u"""[2] [CERN-LHCC2011-999]"""
    _reference_test(self, ref_line, u"""<record>
<datafield ind1="C" ind2="5" tag="999">
<subfield code="o">2</subfield>
<subfield code="r">CERN-LHCC-2011-999</subfield>
</datafield>
</record>""", ignore_misc=False)
def test_valid_utf_8(self):
    """Checks that the utf-8 characters are kept.

    The quoted text, Greek letters and arrows included, is expected
    unchanged in the title subfield.
    """
    ref_line = u"""[2] "Λb → J/ψΛ and B0 → J/ψKS" """
    _reference_test(self, ref_line, u"""<record>
<datafield ind1="C" ind2="5" tag="999">
<subfield code="o">2</subfield>
<subfield code="t">Λb → J/ψΛ and B0 → J/ψKS</subfield>
</datafield>
</record>""")
class TaskTest(InvenioTestCase):
    """Smoke test for the refextract task entry point."""
    def setUp(self):
        # Configure the loggers with verbosity 0 before each test.
        setup_loggers(verbosity=0)
    def test_task_run_core(self):
        """Invoke ``task_run_core``; nothing is asserted beyond not raising."""
        # Imported here rather than at module level, as in the other tests
        # that pull in optional entry points.
        from invenio.refextract_task import task_run_core
-        task_run_core(1)
+        task_run_core(1, [])
# Suite run when this module is executed directly.
# NOTE(review): only RefextractTest is registered; TaskTest, defined just
# above, is not passed to make_test_suite -- confirm this is intentional.
TEST_SUITE = make_test_suite(RefextractTest)
if __name__ == '__main__':
    run_test_suite(TEST_SUITE, warn_user=True)
diff --git a/modules/docextract/lib/refextract_tag.py b/modules/docextract/lib/refextract_tag.py
index fce58abee..8facf4c78 100644
--- a/modules/docextract/lib/refextract_tag.py
+++ b/modules/docextract/lib/refextract_tag.py
@@ -1,1406 +1,1410 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
import re
try:
from unidecode import unidecode
UNIDECODE_AVAILABLE = True
except ImportError:
UNIDECODE_AVAILABLE = False
from invenio.refextract_config import \
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL, \
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL, \
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \
CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID, \
CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID, \
CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION, \
CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION
from invenio.docextract_text import remove_and_record_multiple_spaces_in_line
from invenio.refextract_re import \
re_ibid, \
re_doi, \
re_raw_url, \
re_series_from_numeration, \
re_punctuation, \
re_correct_numeration_2nd_try_ptn1, \
re_correct_numeration_2nd_try_ptn2, \
re_correct_numeration_2nd_try_ptn3, \
re_correct_numeration_2nd_try_ptn4, \
re_numeration_nucphys_vol_page_yr, \
re_numeration_vol_subvol_nucphys_yr_page, \
re_numeration_nucphys_vol_yr_page, \
re_multiple_hyphens, \
re_numeration_vol_page_yr, \
re_numeration_vol_yr_page, \
re_numeration_vol_nucphys_series_yr_page, \
re_numeration_vol_series_nucphys_page_yr, \
re_numeration_vol_nucphys_series_page_yr, \
re_html_tagged_url, \
re_numeration_yr_vol_page, \
re_numeration_vol_nucphys_page_yr, \
re_wash_volume_tag, \
re_numeration_vol_nucphys_yr_subvol_page, \
re_quoted, \
re_isbn, \
re_arxiv, \
+ re_arxiv_5digits, \
re_new_arxiv, \
+ re_new_arxiv_5digits, \
re_pos, \
re_pos_year_num, \
re_series_from_numeration_after_volume, \
RE_OLD_ARXIV, \
RE_ARXIV_CATCHUP, \
RE_ATLAS_CONF_PRE_2010, \
RE_ATLAS_CONF_POST_2010
from invenio.authorextract_re import (get_author_regexps,
etal_matches,
re_ed_notation,
re_etal)
from invenio.docextract_text import wash_line
def tag_reference_line(line, kbs, record_titles_count):
    """Tag the citation elements recognised in one reference line.

    The line is washed and tagged in place for quoted text, ISBNs, arXiv
    numbers, PoS volumes and ATLAS-CONF numbers (first working line). A
    second, upper-cased and punctuation-free copy is then used to locate
    report numbers, journals, IBIDs and publishers; the positions found
    there are handed to process_reference_line, which rebuilds the first
    working line with the tags inserted.

    @param line: (string) the raw reference line.
    @param kbs: (dictionary) the knowledge bases; the entries 'journals',
        'journals_re', 'report-numbers' and 'publishers' are read here and
        the whole dictionary is passed on to process_reference_line.
    @param record_titles_count: (dictionary) running count for the
        reference section, merged with this line's count.
    @return: (tuple) the tagged line and the updated record_titles_count.
    """
    # take a copy of the line as a first working line, clean it of bad
    # accents, and correct punctuation, etc:
    working_line1 = wash_line(line)
    # Identify volume for POS journal
    working_line1 = tag_pos_volume(working_line1)
    # Clean the line once more:
    working_line1 = wash_line(working_line1)
    # We identify quoted text
    # This is useful for books matching
    # This is also used by the author tagger to remove quoted
    # text which is a sign of a title and not an author
    working_line1 = tag_quoted_text(working_line1)
    # Identify ISBN (for books)
    working_line1 = tag_isbn(working_line1)
    # Identify arxiv reports
    working_line1 = tag_arxiv(working_line1)
    working_line1 = tag_arxiv_more(working_line1)
    # Identify volume for POS journal
    # needs special handling because the volume contains the year
    # NOTE(review): tag_pos_volume was already applied near the top of this
    # function -- confirm that this second pass is still needed.
    working_line1 = tag_pos_volume(working_line1)
    # Identify ATL-CONF and ATLAS-CONF report numbers
    # needs special handling because it has 2 formats depending on the year
    # and a 2 years digit format to convert
    working_line1 = tag_atlas_conf(working_line1)
    # Identify journals with regular expression
    # Some journals need to match exact regexps because they can
    # conflict with other elements
    # e.g. DAN is also a common first name
    # NOTE(review): update() modifies kbs['journals'][1] in place on every
    # call -- confirm that mutating the shared knowledge base is intended.
    standardised_titles = kbs['journals'][1]
    standardised_titles.update(kbs['journals_re'])
    journals_matches = identifiy_journals_re(working_line1, kbs['journals_re'])
    # Remove identified tags
    working_line2 = strip_tags(working_line1)
    # Transform the line to upper-case, now making a new working line:
    working_line2 = working_line2.upper()
    # Strip punctuation from the line:
    working_line2 = re_punctuation.sub(u' ', working_line2)
    # Remove multiple spaces from the line, recording
    # information about their coordinates:
    removed_spaces, working_line2 = \
        remove_and_record_multiple_spaces_in_line(working_line2)
    # Identify and record coordinates of institute preprint report numbers:
    found_pprint_repnum_matchlens, found_pprint_repnum_replstr, working_line2 =\
        identify_report_numbers(working_line2, kbs['report-numbers'])
    # Identify and record coordinates of non-standard journal titles:
    journals_matches_more, working_line2, line_titles_count = \
        identify_journals(working_line2, kbs['journals'])
    journals_matches.update(journals_matches_more)
    # Add the count of 'bad titles' found in this line to the total
    # for the reference section:
    record_titles_count = sum_2_dictionaries(record_titles_count,
                                             line_titles_count)
    # Attempt to identify, record and replace any IBIDs in the line:
    if (working_line2.upper().find(u"IBID") != -1):
        # there is at least one IBID in the line - try to
        # identify its meaning:
        found_ibids_matchtext, working_line2 = \
            identify_ibids(working_line2)
        # now update the dictionary of matched title lengths with the
        # matched IBID(s) lengths information:
        journals_matches.update(found_ibids_matchtext)
    publishers_matches = identify_publishers(working_line2, kbs['publishers'])
    # The rebuild works on working_line1 (tags kept, original case);
    # removed_spaces lets it map positions found in working_line2 back.
    tagged_line = process_reference_line(
        working_line=working_line1,
        journals_matches=journals_matches,
        pprint_repnum_len=found_pprint_repnum_matchlens,
        pprint_repnum_matchtext=found_pprint_repnum_replstr,
        publishers_matches=publishers_matches,
        removed_spaces=removed_spaces,
        standardised_titles=standardised_titles,
        kbs=kbs,
    )
    return tagged_line, record_titles_count
def process_reference_line(working_line,
                           journals_matches,
                           pprint_repnum_len,
                           pprint_repnum_matchtext,
                           publishers_matches,
                           removed_spaces,
                           standardised_titles,
                           kbs):
    """After the phase of identifying citation instances in a reference
    line, this function is called to go through the line and the collected
    information about the recognised citations, and to rebuild the line
    with the recognised journals, report numbers and publishers tagged.
    Authors and collaborations are tagged on the rebuilt line afterwards.
    @param working_line: (string) - this is the line before the
    punctuation was stripped. At this stage, it has not been
    capitalised, and neither TITLES nor REPORT NUMBERS have been
    stripped from it. However, any recognised numeration and/or URLs
    have been tagged with <cds.YYYY> tags.
    The working_line could, for example, look something like this:
    [1] CDS <cds.URL description="http //invenio-software.org/">
    http //invenio-software.org/</cds.URL>.
    @param journals_matches: (dictionary) - the journal title citations
    that have been recognised in the line. Keyed by the index within the
    line of each match.
    @param pprint_repnum_len: (dictionary) - the lengths of the matched
    institutional preprint report number citations found within the line.
    Keyed by the index within the line of each match.
    @param pprint_repnum_matchtext: (dictionary) - The matched text for each
    matched institutional report number. Keyed by the index within the line
    of each match.
    @param publishers_matches: (dictionary) - the publishers recognised in
    the line. Keyed by the index within the line of each match.
    @param removed_spaces: (dictionary) - The number of spaces removed from
    the various positions in the line. Keyed by the index of the position
    within the line at which the spaces were removed.
    @param standardised_titles: (dictionary) - The standardised journal
    titles, keyed by the non-standard version of those titles.
    @param kbs: (dictionary) - the knowledge bases; the 'publishers',
    'authors' and 'collaborations' entries are read here.
    @return: (string) the tagged reference line, with every newline
    character removed.
    """
    if len(journals_matches) + len(pprint_repnum_len) + len(publishers_matches) == 0:
        # no TITLE or REPORT-NUMBER citations were found within this line,
        # use the raw line: (This 'raw' line could still be tagged with
        # recognised URLs or numeration.)
        tagged_line = working_line
    else:
        # TITLE and/or REPORT-NUMBER citations were found in this line,
        # build a new version of the working-line in which the standard
        # versions of the REPORT-NUMBERs and TITLEs are tagged:
        startpos = 0 # First cell of the reference line...
        previous_match = {} # previously matched TITLE within line (used
        # for replacement of IBIDs.
        replacement_types = {}
        # Sorted position lists (Python 2: keys() returns a list).
        journals_keys = journals_matches.keys()
        journals_keys.sort()
        reports_keys = pprint_repnum_matchtext.keys()
        reports_keys.sort()
        publishers_keys = publishers_matches.keys()
        publishers_keys.sort()
        spaces_keys = removed_spaces.keys()
        spaces_keys.sort()
        # Maps each match position to u"journal", u"reportnumber" or
        # u"publisher" (the values tested in the loop below).
        replacement_types = get_replacement_types(journals_keys,
                                                  reports_keys,
                                                  publishers_keys)
        replacement_locations = replacement_types.keys()
        replacement_locations.sort()
        tagged_line = u"" # This is to be the new 'working-line'. It will
        # contain the tagged TITLEs and REPORT-NUMBERs,
        # as well as any previously tagged URLs and
        # numeration components.
        # begin:
        for replacement_index in replacement_locations:
            # first, factor in any stripped spaces before this 'replacement'
            true_replacement_index, extras = \
                account_for_stripped_whitespace(spaces_keys,
                                                removed_spaces,
                                                replacement_types,
                                                pprint_repnum_len,
                                                journals_matches,
                                                replacement_index)
            if replacement_types[replacement_index] == u"journal":
                # Add a tagged periodical TITLE into the line:
                rebuilt_chunk, startpos, previous_match = \
                    add_tagged_journal(
                        reading_line=working_line,
                        journal_info=journals_matches[replacement_index],
                        previous_match=previous_match,
                        startpos=startpos,
                        true_replacement_index=true_replacement_index,
                        extras=extras,
                        standardised_titles=standardised_titles)
                tagged_line += rebuilt_chunk
            elif replacement_types[replacement_index] == u"reportnumber":
                # Add a tagged institutional preprint REPORT-NUMBER
                # into the line:
                rebuilt_chunk, startpos = \
                    add_tagged_report_number(
                        reading_line=working_line,
                        len_reportnum=pprint_repnum_len[replacement_index],
                        reportnum=pprint_repnum_matchtext[replacement_index],
                        startpos=startpos,
                        true_replacement_index=true_replacement_index,
                        extras=extras)
                tagged_line += rebuilt_chunk
            elif replacement_types[replacement_index] == u"publisher":
                rebuilt_chunk, startpos = \
                    add_tagged_publisher(
                        reading_line=working_line,
                        matched_publisher=publishers_matches[replacement_index],
                        startpos=startpos,
                        true_replacement_index=true_replacement_index,
                        extras=extras,
                        kb_publishers=kbs['publishers'])
                tagged_line += rebuilt_chunk
        # add the remainder of the original working-line into the rebuilt line:
        tagged_line += working_line[startpos:]
        # we have all the numeration
        # we can make sure there's no space between the volume
        # letter and the volume number
        # e.g. B 20 -> B20
        # NOTE(review): nesting reconstructed from a listing that lost its
        # indentation -- confirm this call belongs to the else-branch.
        tagged_line = wash_volume_tag(tagged_line)
    # Try to find any authors in the line
    tagged_line = identify_and_tag_authors(tagged_line, kbs['authors'])
    # Try to find any collaboration in the line
    tagged_line = identify_and_tag_collaborations(tagged_line,
                                                  kbs['collaborations'])
    return tagged_line.replace('\n', '')
def wash_volume_tag(line):
    """Apply the re_wash_volume_tag substitution to line.

    re_wash_volume_tag is indexed as (pattern, replacement): element 0 is
    the compiled pattern and element 1 the replacement passed to sub().
    """
    washer = re_wash_volume_tag[0]
    replacement = re_wash_volume_tag[1]
    return washer.sub(replacement, line)
def tag_isbn(line):
"""Tag books ISBN"""
return re_isbn.sub(ur'<cds.ISBN>\g<code></cds.ISBN>', line)
def tag_quoted_text(line):
"""Tag quoted titles
We use titles for pretty display of references that we could not
associate we record.
We also use titles for recognising books.
"""
return re_quoted.sub(ur'<cds.QUOTED>\g<title></cds.QUOTED>', line)
def tag_arxiv(line):
    """Tag arxiv report numbers

    Four patterns are applied in turn: re_arxiv_5digits, re_arxiv,
    re_new_arxiv_5digits and re_new_arxiv. Every match is rewritten by the
    same tagger, so all of them must provide the groups ``year``,
    ``month``, ``num`` and ``suffix``.

    The output is normalised to
    <cds.REPORTNUMBER>arXiv:<year><month>.<num></cds.REPORTNUMBER>
    e.g. arXiv:1007.5208; when the ``suffix`` group matched, it is
    appended inside the tag after a single space.
    """
    def tagger(match):
        # Build the replacement from the named groups of the match.
        groups = match.groupdict()
        if match.group('suffix'):
            # Separate the suffix from the number with one space.
            groups['suffix'] = ' ' + groups['suffix']
        else:
            # Replace a missing (None) suffix so it does not print as "None".
            groups['suffix'] = ''
        return u'<cds.REPORTNUMBER>arXiv:%(year)s'\
            u'%(month)s.%(num)s%(suffix)s' \
            u'</cds.REPORTNUMBER>' % groups
+    line = re_arxiv_5digits.sub(tagger, line)
    line = re_arxiv.sub(tagger, line)
+    line = re_new_arxiv_5digits.sub(tagger, line)
    line = re_new_arxiv.sub(tagger, line)
    return line
def tag_arxiv_more(line):
"""Tag old arxiv report numbers
Either formats:
* hep-th/1234567
* arXiv:1022111 [hep-ph] which transforms to hep-ph/1022111
"""
line = RE_ARXIV_CATCHUP.sub(ur"\g<suffix>/\g<year>\g<month>\g<num>", line)
for report_re, report_repl in RE_OLD_ARXIV:
report_number = report_repl + ur"/\g<num>"
line = report_re.sub(u'<cds.REPORTNUMBER>' + report_number
+ u'</cds.REPORTNUMBER>',
line)
return line
def tag_pos_volume(line):
    """Tag POS volume number

    POS is a journal that has special volume numbers
    e.g. PoS LAT2007 (2007) 369

    Every pattern of re_pos is applied in turn. A match is rewritten as
    journal, volume (``volume_name`` + ``volume_num``), optional year and
    page tags.

    @param line: (string) the reference line.
    @return: (string) the line with PoS citations tagged.
    """
    def tagger(match):
        groups = match.groupdict()
        try:
            year = match.group('year')
        except IndexError:
            # This pattern defines no 'year' group: extract the year from
            # the volume name, which should always include the year
            found = re.search(re_pos_year_num, match.group('volume_num'),
                              re.UNICODE)
            # re.search returns None when nothing matches; treat that as
            # "no year" instead of failing with AttributeError on .group()
            if found is None:
                year = None
            else:
                year = found.group(0)
        if year:
            groups['year'] = ' <cds.YR>(%s)</cds.YR>' % year.strip().strip('()')
        else:
            groups['year'] = ''
        return '<cds.JOURNAL>PoS</cds.JOURNAL>' \
            ' <cds.VOL>%(volume_name)s%(volume_num)s</cds.VOL>' \
            '%(year)s' \
            ' <cds.PG>%(page)s</cds.PG>' % groups
    for p in re_pos:
        line = p.sub(tagger, line)
    return line
def tag_atlas_conf(line):
line = RE_ATLAS_CONF_PRE_2010.sub(
ur'<cds.REPORTNUMBER>ATL-CONF-\g<code></cds.REPORTNUMBER>', line)
line = RE_ATLAS_CONF_POST_2010.sub(
ur'<cds.REPORTNUMBER>ATLAS-CONF-\g<code></cds.REPORTNUMBER>', line)
return line
def identifiy_journals_re(line, kb_journals):
    """Locate the first occurrence of each knowledge-base pattern in a line.

    @param line: (string) the reference line to search.
    @param kb_journals: iterable of (pattern, journal) pairs; only the
        pattern is used here.
    @return: (dictionary) keyed by the start position of each pattern's
        first match, with the matched text as value. A later pattern
        matching at the same position replaces the earlier entry.
    """
    found = {}
    for pattern, dummy_journal in kb_journals:
        hit = re.search(pattern, line)
        if hit is None:
            continue
        found[hit.start()] = hit.group(0)
    return found
def find_numeration_more(line):
    """Look for less regular numeration anywhere in the line.

    The "2nd try" patterns are searched (not anchored) in order; the
    first one that matches determines the result.

    @param line: (string) the text to search.
    @return: (dictionary) with keys 'year', 'series', 'volume', 'page'
        and 'len' (length of the 'aftertitle' group), or None when no
        pattern matches.
    """
    candidates = (
        re_correct_numeration_2nd_try_ptn1,
        re_correct_numeration_2nd_try_ptn2,
        re_correct_numeration_2nd_try_ptn3,
        re_correct_numeration_2nd_try_ptn4,
    )

    for candidate in candidates:
        found = candidate.search(line)
        if not found:
            continue

        fields = found.groupdict()
        series = extract_series_from_volume(fields['vol'])
        # Take the first non-empty volume-number group; the alternatives
        # are only consulted when the preceding one is empty.
        volume = (fields['vol_num']
                  or fields['vol_num_alt']
                  or fields['vol_num_alt2'])
        return {'year': fields.get('year', None),
                'series': series,
                'volume': volume,
                'page': fields['page'],
                'len': len(fields['aftertitle'])}

    return None
def add_tagged_report_number(reading_line,
                             len_reportnum,
                             reportnum,
                             startpos,
                             true_replacement_index,
                             extras):
    """Rebuild one segment of the line around a recognised REPORT-NUMBER.

    @param reading_line: (string) the reference line as it was before
        capitalisation and before REPORT-NUMBERs and TITLEs were stripped.
    @param len_reportnum: (integer) length of the matched REPORT-NUMBER.
    @param reportnum: (string) replacement text for the REPORT-NUMBER.
    @param startpos: (integer) position in the reading-line from which
        this segment starts.
    @param true_replacement_index: (integer) index of the match in the
        reading-line, stripped punctuation and whitespace accounted for.
    @param extras: (integer) extra characters to add to the index.
    @return: (tuple) the rebuilt segment (string) and the next 'startpos'
        in the reading-line (integer).
    """
    # Copy the text preceding the report number. When at least one
    # character lies between startpos and the match, the character just
    # before the match is dropped so that an opening brace is removed.
    copy_until = true_replacement_index
    if copy_until - startpos >= 1:
        copy_until -= 1
    rebuilt_line = u"" + reading_line[startpos:copy_until]

    # Append the tagged, standardised report number.
    rebuilt_line += u"<cds.REPORTNUMBER>%(reportnum)s</cds.REPORTNUMBER>" \
                    % {'reportnum' : reportnum}

    # Continue reading after the match...
    next_pos = true_replacement_index + len_reportnum + extras

    # ...and after its closing brace, if it had one.
    try:
        following = reading_line[next_pos]
    except IndexError:
        # The match ended the line: nothing to skip.
        following = None
    if following in (u"]", u")"):
        next_pos += 1

    return rebuilt_line, next_pos
def add_tagged_journal_in_place_of_IBID(previous_match):
    """Build the tagged text that stands in for an IBID.

    The title of the previously matched journal is wrapped in the
    IBID-title opening and closing markers and prefixed with one space.

    @param previous_match: (dictionary) the previously matched journal;
        only its 'title' entry is read.
    @return: (string) the marked-up title.
    """
    pieces = (CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID,
              previous_match['title'],
              CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID)
    return " %s%s%s" % pieces
def extract_series_from_volume(volume):
    """Extract the series part of a volume string.

    The series is looked for with re_series_from_numeration first and,
    failing that, with re_series_from_numeration_after_volume.

    @param volume: (string) the volume text to inspect.
    @return: the first capture group of whichever pattern matched, or
        None when neither matches.
    """
    found = re_series_from_numeration.search(volume)
    if not found:
        found = re_series_from_numeration_after_volume.search(volume)
    if found:
        return found.group(1)
    return None
def create_numeration_tag(info):
    """Render numeration information as volume / year / page tags.

    @param info: (dictionary) with keys 'series', 'volume' and 'page',
        and optionally 'year'. A non-empty series is prepended to the
        volume; the year tag is emitted only when 'year' is non-empty.
    @return: (string) the tags, each one preceded by a single space.
    """
    series = info['series']
    volume = info['volume']
    if series:
        volume = series + volume

    tags = [u' <cds.VOL>%s</cds.VOL>' % volume]
    if info.get('year', False):
        tags.append(u' <cds.YR>(%(year)s)</cds.YR>' % info)
    tags.append(u' <cds.PG>%(page)s</cds.PG>' % info)
    return u''.join(tags)
def add_tagged_journal(reading_line,
                       journal_info,
                       previous_match,
                       startpos,
                       true_replacement_index,
                       extras,
                       standardised_titles):
    """In rebuilding the line, add an identified periodical TITLE (standardised
       and tagged) into the line, followed by its numeration tags.
       A title (or IBID) for which no numeration can be found is not
       tagged: an empty segment and the unchanged 'startpos' and
       previous-match are returned instead.
       @param reading_line: (string) The reference line before capitalization
        was performed, and before REPORT-NUMBERs and TITLEs were stripped out.
       @param journal_info: (string) the matched TITLE text. It is used as
        the key into 'standardised_titles', its length gives the length of
        the match, and it is treated as an IBID when it contains "IBID"
        (case-insensitively).
       @param previous_match: (dict) the previous periodical TITLE citation to
        have been matched in the current reference line, with keys 'title'
        and 'series'. It is used when replacing an IBID instance in the line.
       @param startpos: (integer) the pointer to the next position in the
        reading-line from which to start rebuilding.
       @param true_replacement_index: (integer) the replacement index of the
        matched TITLE in the reading-line, with stripped punctuation and
        whitespace accounted for.
       @param extras: (integer) extras to be added into the replacement index.
       @param standardised_titles: (dictionary) the standardised versions of
        periodical titles, keyed by their various non-standard versions.
        A value of the form "title;series" is split on its last ';'.
       @return: (tuple) containing a string (the rebuilt line segment), an
        integer (the next 'startpos' in the reading-line), and the newly
        updated previous-match (dict, or whatever was passed in when nothing
        was tagged).
    """
    # Remembered so that everything can be rolled back when no numeration
    # follows the title.
    old_startpos = startpos
    old_previous_match = previous_match
    skip_numeration = False
    series = None

    def skip_ponctuation(line, pos):
        # Skip past any punctuation at the end of the replacement that was
        # just made:
        try:
            while line[pos] in (".", ":", "-", ")"):
                pos += 1
        except IndexError:
            # The match was at the very end of the line
            pass

        return pos

    # Fill 'rebuilt_line' (the segment of the line that is being rebuilt to
    # include the tagged and standardised periodical TITLE) with the contents
    # of the reading-line, up to the point of the matched TITLE:
    rebuilt_line = reading_line[startpos:true_replacement_index]

    # Test to see whether a title or an "IBID" was matched:
    if journal_info.upper().find("IBID") != -1:
        # This is an IBID
        # Try to replace the IBID with a title:
        if previous_match:
            # Replace this IBID with the previous title match, if possible:
            rebuilt_line += add_tagged_journal_in_place_of_IBID(previous_match)
            series = previous_match['series']
            # Update start position for next segment of original line:
            startpos = true_replacement_index + len(journal_info) + extras
            startpos = skip_ponctuation(reading_line, startpos)
        else:
            # An IBID with nothing to refer back to: emit nothing and leave
            # 'startpos' where it was.
            rebuilt_line = ""
            skip_numeration = True
    else:
        if ';' in standardised_titles[journal_info]:
            title, series = \
                standardised_titles[journal_info].rsplit(';', 1)
            series = series.strip()
            previous_match = {'title': title,
                              'series': series}
        else:
            title = standardised_titles[journal_info]
            previous_match = {'title': title,
                              'series': None}

        # This is a normal title, not an IBID
        rebuilt_line += "<cds.JOURNAL>%s</cds.JOURNAL>" % title
        startpos = true_replacement_index + len(journal_info) + extras
        startpos = skip_ponctuation(reading_line, startpos)

    if not skip_numeration:
        # Check for numeration
        numeration_line = reading_line[startpos:]
        # First look for standard numeration
        numerotation_info = find_numeration(numeration_line)
        if not numerotation_info:
            numeration_line = rebuilt_line + " " + numeration_line
            # Now look for more funky numeration
            # With possibly some elements before the journal title
            numerotation_info = find_numeration_more(numeration_line)

        if not numerotation_info:
            # No numeration at all: undo the tagging of this title.
            startpos = old_startpos
            previous_match = old_previous_match
            rebuilt_line = ""
        else:
            # The series from the title (or from the previous match, for an
            # IBID) is only a fallback for a series found in the numeration.
            if series and not numerotation_info['series']:
                numerotation_info['series'] = series
            startpos += numerotation_info['len']
            rebuilt_line += create_numeration_tag(numerotation_info)
            # NOTE(review): for an IBID, 'previous_match' is the caller's
            # dict, so this assignment modifies it in place.
            previous_match['series'] = numerotation_info['series']

    # return the rebuilt line-segment, the position (of the reading line) from
    # which the next part of the rebuilt line should be started, and the newly
    # updated previous match.
    return rebuilt_line, startpos, previous_match
def add_tagged_publisher(reading_line,
                         matched_publisher,
                         startpos,
                         true_replacement_index,
                         extras,
                         kb_publishers):
    """Rebuild one segment of the line around a recognised publisher.

    @param reading_line: (string) the reference line as it was before
        capitalisation and before recognised items were stripped out.
    @param matched_publisher: (string) the publisher text as matched; it
        is the key into 'kb_publishers' and its length is the length of
        the match.
    @param startpos: (integer) position in the reading-line from which
        this segment starts.
    @param true_replacement_index: (integer) index of the match in the
        reading-line, stripped punctuation and whitespace accounted for.
    @param extras: (integer) extra characters to add to the index.
    @param kb_publishers: (dictionary) keyed by matched publisher text;
        the 'repl' entry of each value is the standardised name.
    @return: (tuple) the rebuilt segment (string) and the next 'startpos'
        in the reading-line (integer).
    """
    # Text of the reading-line that precedes the publisher.
    leading_text = reading_line[startpos:true_replacement_index]

    # The standardised publisher name, tagged.
    standard_name = kb_publishers[matched_publisher]['repl']
    tagged = "<cds.PUBLISHER>%(title)s</cds.PUBLISHER>" \
             % {'title' : standard_name}

    # Reading resumes right after the matched publisher.
    next_pos = true_replacement_index + len(matched_publisher) + extras

    return leading_text + tagged, next_pos
def get_replacement_types(titles, reportnumbers, publishers):
    """Map every replacement position in the line to the kind of item
       recognised there.
       The kinds are the strings:
           'journal'      - a periodical title;
           'reportnumber' - a preprint report number;
           'publisher'    - a publisher.
       When one position appears in several inputs the later input wins
       (publishers over report numbers over titles).
       @param titles: iterable of positions at which periodical titles
        were found.
       @param reportnumbers: iterable of positions at which report
        numbers were found.
       @param publishers: iterable of positions at which publishers
        were found.
       @return: (dictionary) of replacement kinds keyed by position.
    """
    rep_types = {}
    labelled_positions = (
        (titles, "journal"),
        (reportnumbers, "reportnumber"),
        (publishers, "publisher"),
    )
    for positions, label in labelled_positions:
        rep_types.update(dict.fromkeys(positions, label))
    return rep_types
def account_for_stripped_whitespace(spaces_keys,
                                    removed_spaces,
                                    replacement_types,
                                    len_reportnums,
                                    journals_matches,
                                    replacement_index):
    """Translate an index in the working line into an index in the
       reading line.
       Items are searched for in the 'working line' (upper-cased, with
       punctuation and multiple spaces stripped) but the tagged line is
       rebuilt from the original 'reading line'. For example:
           [26] E. Witten and S.-T. Yau, hep-th/9910245.
       becomes, in the working line:
           [26] E WITTEN AND S T YAU HEP TH/9910245
       so the report number sits at a different index in each. This
       function adds back the characters stripped before the item, and
       separately counts those stripped from inside a journal title or
       report number.
       @param spaces_keys: (list) the indices at which spaces were
        removed from the reference line.
       @param removed_spaces: (dictionary) keyed by those indices; the
        values are the number of spaces removed at each one.
       @param replacement_types: (dictionary) the kind of replacement
        ("journal", "reportnumber", ...) at each replacement index.
       @param len_reportnums: (dictionary) lengths of the REPORT-NUMBERs
        matched at the various indices in the line.
       @param journals_matches: (dictionary) the journal text matched at
        the various indices in the line; its length is used.
       @param replacement_index: (integer) the index in the working line
        of the identified item.
       @return: (tuple) containing 2 elements:
                        + the true replacement index in the reading line;
                        + any extras to add into the replacement index;
    """
    extras = 0
    true_replacement_index = replacement_index
    # Like true_replacement_index, but also advanced by the spaces that
    # were stripped from inside the item itself.
    spare_replacement_index = replacement_index

    for space in spaces_keys:
        removed = removed_spaces  # looked up lazily below, per branch

        if space < true_replacement_index:
            # Spaces stripped ahead of the item shift both indices.
            true_replacement_index += removed[space]
            spare_replacement_index += removed[space]
            continue

        if space < spare_replacement_index:
            continue

        # From here on the space lies at or after the start of the item;
        # it only counts when it falls inside a journal title or a
        # report number.
        kind = replacement_types[replacement_index]
        if kind == u"journal":
            item_length = len(journals_matches[replacement_index])
        elif kind == u"reportnumber":
            item_length = len_reportnums[replacement_index]
        else:
            continue

        if space < spare_replacement_index + item_length:
            spare_replacement_index += removed[space]
            extras += removed[space]

    return true_replacement_index, extras
def strip_tags(line):
# Firstly, go through and change ALL TAGS and their contents to underscores
# author content can be checked for underscores later on
# Note that we don't have embedded tags this is why
# we can do this
re_tag = re.compile(ur'<cds\.[A-Z]+>[^<]*</cds\.[A-Z]+>|<cds\.[A-Z]+ />',
re.UNICODE)
for m in re_tag.finditer(line):
chars_count = m.end() - m.start()
line = re_tag.sub('_'*chars_count, line, count=1)
return line
def identify_and_tag_collaborations(line, collaborations_kb):
    """Wrap collaboration names found in the line in collaboration markers.

    Every pattern of the knowledge base is searched in a copy of the
    line whose existing tags have been blanked out by strip_tags(), so
    text inside a tag blanked out that way cannot match. The first
    capture group of each match, stripped of surrounding punctuation,
    replaces the whole match between the opening and closing
    collaboration markers.

    @param line: (string) the reference line.
    @param collaborations_kb: mapping whose values are compiled patterns;
        the keys are not used.
    @return: (string) the line with collaborations tagged.
    """
    for dummy_collab, re_collab in collaborations_kb.iteritems():
        hits = list(re_collab.finditer(strip_tags(line)))
        # Go from the last match to the first so that the positions of
        # the matches still to be processed stay valid.
        hits.reverse()
        for hit in hits:
            collaboration = hit.group(1).strip(".,:;- [](){}")
            before = line[:hit.start()]
            after = line[hit.end():]
            line = before \
                + CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION \
                + collaboration \
                + CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION \
                + after
    return line
def identify_and_tag_authors(line, authors_kb):
    """Given a reference, look for a group of author names,
       place tags around the author group, return the newly tagged line.
       @param line: (string) the reference line, possibly already
        containing other <cds.*> tags.
       @param authors_kb: iterable of (pattern, replacement) pairs applied
        to the line with plain string replacement before matching.
       @return: (string) the line with author groups wrapped in
        <cds.AUTHetal>, <cds.AUTHstnd> or <cds.AUTHincl> tags. Groups
        judged to be bad matches are left untagged.
    """
    re_auth, re_auth_near_miss = get_author_regexps()

    # Replace authors which do not convert well from utf-8
    for pattern, repl in authors_kb:
        line = line.replace(pattern, repl)

    output_line = line

    # We matched authors here
    # ('line' now has every existing tag blanked out with underscores;
    # 'output_line' keeps the tags and is the string that gets modified)
    line = strip_tags(output_line)
    matched_authors = list(re_auth.finditer(line))
    # We try to have better results by unidecoding
    if UNIDECODE_AVAILABLE:
        unidecoded_line = strip_tags(unidecode(output_line))
        matched_authors_unidecode = list(re_auth.finditer(unidecoded_line))

        if len(matched_authors_unidecode) > len(matched_authors):
            # NOTE(review): 'line' is not replaced by 'unidecoded_line'
            # here, although the match positions used below now refer to
            # the unidecoded text - confirm this is safe when unidecode
            # changes the length of the string.
            output_line = unidecode(output_line)
            matched_authors = matched_authors_unidecode

    # If there is at least one matched author group
    if matched_authors:
        matched_positions = []
        preceeding_text_string = line
        preceeding_text_start = 0
        for auth_no, match in enumerate(matched_authors):
            # Only if there are no underscores or closing arrows found in the matched author group
            # This must be checked for here, as it cannot be applied to the re without clashing with
            # other Unicode characters
            if line[match.start():match.end()].find("_") == -1:
                # Has the group with name 'et' (for 'et al') been found in the pattern?
                # Has the group with name 'es' (for ed. before the author) been found in the pattern?
                # Has the group with name 'ee' (for ed. after the author) been found in the pattern?
                matched_positions.append({
                    'start'       : match.start(),
                    'end'         : match.end(),
                    'etal'        : match.group('et') or match.group('et2'),
                    'ed_start'    : match.group('es'),
                    'ed_end'      : match.group('ee'),
                    'multi_auth'  : match.group('multi_auth'),
                    'multi_surs'  : match.group('multi_surs'),
                    'text_before' : preceeding_text_string[preceeding_text_start:match.start()],
                    'auth_no'     : auth_no,
                    'author_names': match.group('author_names')
                })
                # Save the end of the match, from where to snip the misc text found before an author match
                preceeding_text_start = match.end()

        # Work backwards to avoid index problems when adding AUTH tags
        matched_positions.reverse()
        for m in matched_positions:
            dump_in_misc = False
            start = m['start']
            end = m['end']

            # Check the text before the current match to see if it has a bad 'et al'
            lower_text_before = m['text_before'].strip().lower()
            for e in etal_matches:
                if lower_text_before.endswith(e):
                    ## If so, this author match is likely to be a bad match on a missed title
                    dump_in_misc = True
                    break

            # An AND found here likely indicates a missed author before this text
            # Thus, triggers weaker author searching, within the previous misc text
            # (Check the text before the current match to see if it has a bad 'and')
            # A bad 'and' will only be denoted as such if there exists only one author after it
            # and the author group is legit (not to be dumped in misc)
            if not dump_in_misc and not (m['multi_auth'] or m['multi_surs']) \
                    and (lower_text_before.endswith(' and')):
                # Search using a weaker author pattern to try and find the missed author(s) (cut away the end 'and')
                weaker_match = re_auth_near_miss.match(m['text_before'])
                if weaker_match and not (weaker_match.group('es') or weaker_match.group('ee')):
                    # Change the start of the author group to include this new author group
                    start = start - (len(m['text_before']) - weaker_match.start())
                # Still no match, do not add tags for this author match.. dump it into misc
                else:
                    dump_in_misc = True

            add_to_misc = ""
            # If a semi-colon was found at the end of this author group, keep it in misc
            # so that it can be looked at for splitting heurisitics
            if len(output_line) > m['end']:
                if output_line[m['end']].strip(" ,.") == ';':
                    add_to_misc = ';'

            # Standardize eds. notation
            # NOTE(review): the fourth positional argument of re.sub() is
            # 'count', not 'flags', so re.IGNORECASE is used here as a
            # maximum number of substitutions - confirm whether
            # case-insensitive matching was intended (same for the two
            # 're_etal' calls below).
            tmp_output_line = re.sub(re_ed_notation, '(ed.)',
                                     output_line[start:end], re.IGNORECASE)
            # Standardize et al. notation
            tmp_output_line = re.sub(re_etal, 'et al.',
                                     tmp_output_line, re.IGNORECASE)
            # Strip
            tmp_output_line = tmp_output_line.lstrip('.').strip(",:;- [](")
            # A closing parenthesis is only kept when it ends '(ed.)'
            if not tmp_output_line.endswith('(ed.)'):
                tmp_output_line = tmp_output_line.strip(')')

            # ONLY wrap author data with tags IF there is no evidence that it is an
            # ed. author. (i.e. The author is not referred to as an editor)
            # Does this author group string have 'et al.'?
            if m['etal'] and not (m['ed_start'] or m['ed_end'] or dump_in_misc):
                output_line = output_line[:start] \
                    + "<cds.AUTHetal>" \
                    + tmp_output_line \
                    + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL \
                    + add_to_misc \
                    + output_line[end:]
            elif not (m['ed_start'] or m['ed_end'] or dump_in_misc):
                # Insert the std (standard) tag
                output_line = output_line[:start] \
                    + "<cds.AUTHstnd>" \
                    + tmp_output_line \
                    + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND \
                    + add_to_misc \
                    + output_line[end:]
            # Apply the 'include in $h' method to author groups marked as editors
            elif m['ed_start'] or m['ed_end']:
                ed_notation = " (eds.)"
                # Standardize et al. notation
                tmp_output_line = re.sub(re_etal, 'et al.',
                                         m['author_names'], re.IGNORECASE)
                # remove any characters which denote this author group
                # to be editors, just take the
                # author names, and append '(ed.)'
                output_line = output_line[:start] \
                    + "<cds.AUTHincl>" \
                    + tmp_output_line.strip(",:;- [](") \
                    + ed_notation \
                    + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL \
                    + add_to_misc \
                    + output_line[end:]

    return output_line
def sum_2_dictionaries(dicta, dictb):
    """Given two dictionaries of totals, where each total refers to a key
       in the dictionary, add the totals.
       E.g.:  dicta = { 'a' : 3, 'b' : 1 }
              dictb = { 'a' : 1, 'c' : 5 }
              dicta + dictb = { 'a' : 4, 'b' : 1, 'c' : 5 }
       Neither input dictionary is modified.
       @param dicta: (dictionary)
       @param dictb: (dictionary)
       @return: (dictionary) - the sum of the 2 dictionaries
    """
    dict_out = dicta.copy()
    for key, total in dictb.items():
        # The membership test must use the variable 'key'; testing the
        # literal string 'key' made shared keys get overwritten instead
        # of summed.
        if key in dict_out:
            # Add the sum for key in dictb to that of dict_out:
            dict_out[key] = dict_out[key] + total
        else:
            # the key is not in the first dictionary - add it directly:
            dict_out[key] = total
    return dict_out
def identify_ibids(line):
    """Find IBIDs within the line, record their position and text,
       and replace them with underscores.
       @param line: (string) the working reference line
       @return: (tuple) containing a dictionary and a string:
         Dictionary: matched IBID text: (Key: position of IBID in
                     line; Value: matched IBID text)
         String:     working line with each matched IBID replaced by
                     underscores of the same length
    """
    ibid_match_txt = {}
    pieces = []
    cursor = 0
    for found in re_ibid.finditer(line):
        matched_text = found.group(0)
        ibid_match_txt[found.start()] = matched_text
        # Keep the text between the previous IBID and this one, then
        # mask this IBID without changing the length of the line.
        pieces.append(line[cursor:found.start()])
        pieces.append("_" * len(matched_text))
        cursor = found.end()
    pieces.append(line[cursor:])
    return ibid_match_txt, "".join(pieces)
def find_all(string, sub):
    """List the start index of every occurrence of a substring.

    The search resumes one character after each hit, so overlapping
    occurrences are all reported.

    @param string: (string) the text to search.
    @param sub: (string) the substring to look for.
    @return: (list) of integer positions, in increasing order.
    """
    positions = []
    position = string.find(sub, 0)
    while True:
        if position < 0:
            break
        positions.append(position)
        position = string.find(sub, position + 1)
    return positions
def find_numeration(line):
    """Given the text that follows a title, attempt to recognise citation
       'numeration' at its very beginning.
       The patterns are tried in a fixed order and must match from the
       start of the line; the first one that matches is used.
       @param line: (string) the text to examine.
       @return: (dictionary) with keys 'year', 'series', 'volume', 'page'
        and 'len' (the end offset of the match in the line), or None when
        no pattern matches.
    """
    candidates = (
        # vol,page,year
        re_numeration_vol_page_yr,
        re_numeration_vol_nucphys_page_yr,
        re_numeration_nucphys_vol_page_yr,

        # With sub volume
        re_numeration_vol_subvol_nucphys_yr_page,
        re_numeration_vol_nucphys_yr_subvol_page,

        # vol,year,page
        re_numeration_vol_yr_page,
        re_numeration_nucphys_vol_yr_page,
        re_numeration_vol_nucphys_series_yr_page,

        # vol,page,year
        re_numeration_vol_series_nucphys_page_yr,
        re_numeration_vol_nucphys_series_page_yr,

        # year,vol,page
        re_numeration_yr_vol_page,
    )

    for candidate in candidates:
        found = candidate.match(line)
        if not found:
            continue

        fields = found.groupdict()
        # An explicit 'series' group wins; otherwise the series is
        # extracted from the volume text.
        series = fields.get('series', None) \
            or extract_series_from_volume(fields['vol'])
        # Take the first non-empty volume-number group.
        volume = (fields['vol_num']
                  or fields['vol_num_alt']
                  or fields['vol_num_alt2'])
        return {'year': fields.get('year', None),
                'series': series,
                'volume': volume,
                'page': fields['page'],
                'len': found.end()}

    return None
def identify_journals(line, kb_journals):
    """Attempt to identify all periodical titles in a reference line.
       Every title found is recorded and overwritten in the working line
       by underscores.
       @param line: (string) - the working reference line.
       @param kb_journals: (sequence) - element 0 is a dictionary of
        compiled search patterns keyed by the non-standard TITLE string;
        element 2 is the list of those TITLE strings in the order in which
        they must be searched. Other elements are not used here.
       @return: (tuple) containing 3 elements:
                        + (dictionary) - the title (knowledge-base key)
                                         matched at each given index
                                         within the line.
                        + (string)     - the working line, with the
                                         titles replaced by underscores.
                        + (dictionary) - the number of matches found in
                                         the line for each title.
    """
    search_patterns = kb_journals[0]
    ordered_titles = kb_journals[2]

    title_matches = {}
    titles_count = {}

    for title in ordered_titles:
        # The iterator works on the line as it is at this point; the
        # same-length replacements made below keep its positions valid.
        for hit in search_patterns[title].finditer(line):
            titles_count[title] = titles_count.get(title, 0) + 1

            begin = hit.start()
            title_matches[begin] = title

            # Mask as many characters as the knowledge-base title has:
            masked_length = len(title)
            line = u"".join((line[:begin],
                             u"_" * masked_length,
                             line[begin + masked_length:]))

    return title_matches, line, titles_count
def identify_report_numbers(line, kb_reports):
    """Attempt to identify all preprint report numbers in a reference
       line.
       Every report number found is recorded and overwritten in the
       working line by underscores.
       @param line: (string) - the working reference line.
       @param kb_reports: (tuple) of 2 dictionaries sharing the same keys
        (one key per report-numbering style):
          * the compiled patterns used to identify report numbers; each
            has a group named 'numn' for the numeration part, and its
            group 1 delimits the text that is recorded and masked;
          * the standardised 'category' text of each style.
       @return: (tuple) - 3 elements:
           * a dictionary containing the lengths in the line of the
             matched preprint report numbers, keyed by the index at
             which each match was found in the line.
           * a dictionary containing the replacement strings (standardised
             versions) of preprint report numbers that were matched in
             the line, keyed the same way.
           * a string, that is the new version of the working reference
             line, in which any matched preprint report numbers have been
             replaced by underscores.
    """
    def _longest_first(a, b):
        """Comparison function ordering elements by decreasing length
           of their second item.
        """
        len_a = len(a[1])
        len_b = len(b[1])
        return int(len_a < len_b) - int(len_a > len_b)

    repnum_matches_matchlen = {}
    repnum_matches_repl_str = {}

    repnum_search_kb, repnum_standardised_categs = kb_reports
    repnum_categs = repnum_standardised_categs.keys()
    repnum_categs.sort(_longest_first)

    # Handle CERN/LHCC/98-013
    line = line.replace('/', ' ')

    for categ in repnum_categs:
        # The iterator works on the line as it is at this point; the
        # same-length replacements made below keep its positions valid.
        for hit in repnum_search_kb[categ].finditer(line):
            # Clean/standardise the numeration part of the report number:
            numeration = hit.group('numn').replace(" ", "-")
            numeration = re_multiple_hyphens.sub("-", numeration)
            for untidy_slash in ("/-", "-/", "-/-"):
                numeration = numeration.replace(untidy_slash, "/")

            begin = hit.start(1)
            matched_text = hit.group(1)

            # Mask the report number in the working line:
            line = line[0:begin] \
                + "_" * len(matched_text) + line[hit.end(1):]

            # Record its length and its standardised replacement:
            repnum_matches_matchlen[begin] = len(matched_text)
            repnum_matches_repl_str[begin] = \
                repnum_standardised_categs[categ] + numeration

    return repnum_matches_matchlen, repnum_matches_repl_str, line
def identify_publishers(line, kb_publishers):
    """Locate every publisher of the knowledge base in the line.

    @param line: (string) the working reference line.
    @param kb_publishers: mapping keyed by publisher abbreviation; the
        'pattern' entry of each value is a compiled pattern.
    @return: (dictionary) keyed by the start position of each match, with
        the abbreviation whose pattern matched there as value.
    """
    matches_repl = {}
    for abbrev, info in kb_publishers.iteritems():
        positions = [hit.start(0) for hit in info['pattern'].finditer(line)]
        matches_repl.update(dict.fromkeys(positions, abbrev))
    return matches_repl
def identify_and_tag_URLs(line):
    """Given a reference line, identify URLs in the line, record the
       information about them, and replace them with a "<cds.URL />" tag.
       URLs are identified in 2 forms:
        + Raw: http://invenio-software.org/
        + HTML marked-up: <a href="http://invenio-software.org/">CERN Document
          Server Software Consortium</a>
       These URLs are considered to have 2 components: The URL itself
       (url string); and the URL description. The description is effectively
       the text used for the created Hyperlink when the URL is marked-up
       in HTML. When an HTML marked-up URL has been recognised, the text
       between the anchor tags is therefore taken as the URL description.
       In the case of a raw URL recognition, however, the URL itself will
       also be used as the URL description.
       For example, in the following reference line:
        [1] See <a href="http://invenio-software.org/">CERN Document Server
        Software Consortium</a>.
       ...the URL string will be "http://invenio-software.org/" and the URL
       description will be
       "CERN Document Server Software Consortium".
       The line returned from this function will be:
        [1] See <cds.URL />
       In the following line, however:
        [1] See http://invenio-software.org/ for more details.
       ...the URL string will be "http://invenio-software.org/" and the URL
       description will also be "http://invenio-software.org/".
       The line returned will be:
        [1] See <cds.URL /> for more details.
       A single trailing full-stop or comma is removed from a raw URL
       before it is recorded.
       @param line: (string) the reference line in which to search for URLs.
       @return: (tuple) - containing 2 items:
        + the line after URLs have been recognised and removed;
        + a list of 2-item tuples where each tuple represents a recognised URL
          and its description, ordered by position in the line:
            [(url, url-description), (url, url-description), ... ]
    """
    # Details of every matched URL, keyed by its start position in the line:
    #   start position -> (length of full match, url string, url description)
    found_urls = {}

    # Attempt to identify all HTML-MARKED-UP URLs in the line:
    for m_tagged_url in re_html_tagged_url.finditer(line):
        startposn = m_tagged_url.start()
        endposn = m_tagged_url.end()
        matchlen = endposn - startposn
        found_urls[startposn] = (matchlen,
                                 m_tagged_url.group('url'),
                                 m_tagged_url.group('desc'))
        # Temporarily replace the match with underscores so that the raw-URL
        # search below won't re-find it. The replacement has the same
        # length, so recorded positions stay valid.
        line = line[:startposn] + u"_" * matchlen + line[endposn:]

    # Attempt to identify all RAW (i.e. not HTML-marked-up) URLs in the line:
    for m_raw_url in re_raw_url.finditer(line):
        startposn = m_raw_url.start()
        endposn = m_raw_url.end()
        matchlen = endposn - startposn
        matched_url = m_raw_url.group('url')
        if matched_url and matched_url[-1] in (".", ","):
            # Strip the full-stop or comma from the end of the url:
            matched_url = matched_url[:-1]
        # A raw URL serves as its own description:
        found_urls[startposn] = (matchlen, matched_url, matched_url)
        line = line[:startposn] + u"_" * matchlen + line[endposn:]

    # Now that all URLs have been identified, put a tag in place of each one.
    # Work from the end of the line backwards so that the positions of the
    # URLs still to be replaced are not shifted by earlier replacements:
    found_url_positions = sorted(found_urls)
    for url_position in reversed(found_url_positions):
        matchlen = found_urls[url_position][0]
        line = line[:url_position] + "<cds.URL />" \
               + line[url_position + matchlen:]

    # Record (url, description) for each URL, in order of appearance:
    identified_urls = [found_urls[url_position][1:]
                       for url_position in found_url_positions]

    # return the line containing the tagged URLs:
    return line, identified_urls
def identify_and_tag_DOI(line):
    """Replace every DOI reference in a citation line with "<cds.DOI />".

       DOI references are recognised by the module-level re_doi pattern,
       which is meant to cover both the http (url) form and the standard
       DOI notation (DOI: ...).
       @param line: (string) the reference line in which to search for DOI's.
       @return: (tuple) the tagged line and the list of DOI strings found,
        in order of appearance (empty when there are none).
    """
    # Collect all matches up front; they refer to offsets in the untouched
    # line.
    doi_matches = list(re_doi.finditer(line))

    # Group 6 of re_doi is taken as the DOI itself, i.e. without the url
    # part of the matched text.
    doi_strings = [doi_match.group(6) for doi_match in doi_matches]

    # Substitute from right to left so that the offsets of the matches that
    # are still to be replaced remain valid.
    for doi_match in reversed(doi_matches):
        line = line[0:doi_match.start()] + "<cds.DOI />" \
               + line[doi_match.end():]

    return line, doi_strings
diff --git a/modules/docextract/lib/refextract_unit_tests.py b/modules/docextract/lib/refextract_unit_tests.py
index c9dc6ed3c..d1d1376de 100644
--- a/modules/docextract/lib/refextract_unit_tests.py
+++ b/modules/docextract/lib/refextract_unit_tests.py
@@ -1,309 +1,407 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2010, 2011, 2013 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
The Refextract unit test suite
The tests will not modify the database.
"""
from invenio.testutils import InvenioTestCase
import re
from invenio.testutils import make_test_suite, run_test_suite
# Import the minimal necessary methods and variables needed to run Refextract
from invenio.docextract_utils import setup_loggers
+
from invenio.refextract_tag import identify_ibids, \
find_numeration, \
find_numeration_more
+
+from invenio.refextract_tag import identify_ibids, tag_arxiv
from invenio import refextract_re
from invenio.refextract_find import get_reference_section_beginning
from invenio.refextract_api import search_from_reference
from invenio.refextract_text import rebuild_reference_lines
class ReTest(InvenioTestCase):
    """Tests for the pattern-building helpers of refextract_re."""

    def setUp(self):
        setup_loggers(verbosity=1)

    def test_word(self):
        r = refextract_re._create_regex_pattern_add_optional_spaces_to_word_characters('ABC')
        self.assertEqual(r, u'A\\s*B\\s*C\\s*')

    def test_reference_section_title_pattern(self):
        r = refextract_re.get_reference_section_title_patterns()
        self.assertTrue(len(r) > 2)

    def test_get_reference_line_numeration_marker_patterns(self):
        r = refextract_re.get_reference_line_numeration_marker_patterns()
        self.assertTrue(len(r) > 2)

    def test_get_reference_line_marker_pattern(self):
        r = refextract_re.get_reference_line_marker_pattern('ABC')
        # The requested marker text must appear in the compiled pattern.
        self.assertNotEqual(r.pattern.find('ABC'), -1)

    def test_get_post_reference_section_title_patterns(self):
        r = refextract_re.get_post_reference_section_title_patterns()
        self.assertTrue(len(r) > 2)

    def test_get_post_reference_section_keyword_patterns(self):
        r = refextract_re.get_post_reference_section_keyword_patterns()
        self.assertTrue(len(r) > 2)

    def test_regex_match_list(self):
        s = 'ABC'
        # A match is reported when at least one pattern of the list matches.
        m = refextract_re.regex_match_list(s, [
            re.compile('C.C'),
            re.compile('A.C'),
        ])
        self.assertTrue(m)
        # No pattern matches: the result is None.
        m = refextract_re.regex_match_list(s, [
            re.compile('C.C'),
        ])
        self.assertEqual(m, None)
class IbidTest(InvenioTestCase):
    """Checks what identify_ibids reports for empty and simple lines."""

    def setUp(self):
        setup_loggers(verbosity=1)

    def test_identify_ibids_empty(self):
        # Nothing to find in an empty line: no ibids, line unchanged.
        self.assertEqual(identify_ibids(""), ({}, ''))

    def test_identify_ibids_simple(self):
        source_line = u"[46] E. Schrodinger, Sitzungsber. Preuss. Akad. Wiss. Phys. Math. Kl. 24, 418(1930); ibid, 3, 1(1931)"
        # The ibid found at offset 85 is reported and masked with
        # underscores in the returned line.
        expected = (
            {85: u'IBID'},
            u'[46] E. SCHRODINGER, SITZUNGSBER. PREUSS. AKAD. WISS. PHYS. MATH. KL. 24, 418(1930); ____, 3, 1(1931)',
        )
        self.assertEqual(identify_ibids(source_line.upper()), expected)
class FindNumerationTest(InvenioTestCase):
    """Checks volume/year/page extraction for several numeration layouts."""

    def setUp(self):
        setup_loggers(verbosity=1)

    def _assert_numeration(self, numeration, volume, year, page):
        """Compare the three fields of a numeration result with expectations."""
        self.assertEqual(numeration['volume'], volume)
        self.assertEqual(numeration['year'], year)
        self.assertEqual(numeration['page'], page)

    def test_vol_page_year(self):
        "<vol>, <page> (<year>)"
        numeration = find_numeration(u"24, 418 (1930)")
        self._assert_numeration(numeration, u"24", u"1930", u"418")

    def test_vol_year_page(self):
        "<vol>, (<year>) <page> "
        numeration = find_numeration(u"24, (1930) 418")
        self._assert_numeration(numeration, u"24", u"1930", u"418")

    def test_year_title_volume_page(self):
        "<year>, <title> <vol> <page> "
        numeration = find_numeration_more(
            u"1930 <cds.JOURNAL>J.Phys.</cds.JOURNAL> 24, 418")
        self._assert_numeration(numeration, u"24", u"1930", u"418")
class FindSectionTest(InvenioTestCase):
    """Checks how get_reference_section_beginning locates the references."""

    def setUp(self):
        setup_loggers(verbosity=1)

    @staticmethod
    def _untitled_section(marker, marker_pattern, how_found_start):
        """Build the expected result for a section found without a title."""
        return {
            'marker': marker,
            'marker_pattern': marker_pattern,
            'start_line': 1,
            'title_string': None,
            'title_marker_same_line': False,
            'how_found_start': how_found_start,
        }

    def test_simple(self):
        document = ["Hello", "References", "[1] Ref1"]
        expected = {
            'marker': '[1]',
            'marker_pattern': u'\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])',
            'start_line': 1,
            'title_string': 'References',
            'title_marker_same_line': False,
            'how_found_start': 1,
        }
        self.assertEqual(get_reference_section_beginning(document), expected)

    def test_no_section(self):
        self.assertEqual(get_reference_section_beginning(""), None)

    def test_no_title_via_brackets(self):
        # NOTE(review): there is no comma between the last two literals, so
        # they concatenate into one line "[1] Ref1[2] Ref2" - confirm that
        # this is intended before "fixing" it.
        document = [
            "Hello",
            "[1] Ref1"
            "[2] Ref2"
        ]
        expected = self._untitled_section(
            '[1]',
            u'(?P<mark>(?P<left>\\[)\\s*(?P<marknum>\\d+)\\s*(?P<right>\\]))',
            2)
        self.assertEqual(get_reference_section_beginning(document), expected)

    def test_no_title_via_dots(self):
        # NOTE(review): the last two literals concatenate into one line
        # "1. Ref12. Ref2" (no comma between them) - confirm intended.
        document = [
            "Hello",
            "1. Ref1"
            "2. Ref2"
        ]
        expected = self._untitled_section(
            '1.',
            u'(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>\\.))',
            3)
        self.assertEqual(get_reference_section_beginning(document), expected)

    def test_no_title_via_numbers(self):
        # NOTE(review): the last two literals concatenate into one line
        # "1 Ref12 Ref2" (no comma between them) - confirm intended.
        document = [
            "Hello",
            "1 Ref1"
            "2 Ref2"
        ]
        expected = self._untitled_section(
            '1',
            u'(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))',
            4)
        self.assertEqual(get_reference_section_beginning(document), expected)

    def test_no_title_via_numbers2(self):
        document = ["Hello", "1", "Ref1", "(3)", "2", "Ref2"]
        expected = self._untitled_section(
            '1',
            u'(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))',
            4)
        self.assertEqual(get_reference_section_beginning(document), expected)
class SearchTest(InvenioTestCase):
    """Tests for search_from_reference (reference line -> search query)."""

    def setUp(self):
        setup_loggers(verbosity=9)
        # Run with an empty knowledge-base override; the previous value is
        # remembered so that tearDown can put it back.
        from invenio import refextract_kbs
        self.old_override = refextract_kbs.CFG_REFEXTRACT_KBS_OVERRIDE
        refextract_kbs.CFG_REFEXTRACT_KBS_OVERRIDE = {}

    def tearDown(self):
        from invenio import refextract_kbs
        refextract_kbs.CFG_REFEXTRACT_KBS_OVERRIDE = self.old_override

    def test_not_recognized(self):
        field, pattern = search_from_reference('[1] J. Mars, oh hello')
        self.assertEqual(field, '')
        self.assertEqual(pattern, '')

    def test_report(self):
        field, pattern = search_from_reference('[1] J. Mars, oh hello, [hep-ph/0104088]')
        self.assertEqual(field, 'report')
        self.assertEqual(pattern, 'hep-ph/0104088')

    def test_journal(self):
        field, pattern = search_from_reference('[1] J. Mars, oh hello, Nucl.Phys. B76 (1974) 477-482')
        self.assertEqual(field, 'journal')
        # Title, volume and first page must all appear in the search pattern.
        self.assertTrue('Nucl' in pattern)
        self.assertTrue('B76' in pattern)
        self.assertTrue('477' in pattern)
class RebuildReferencesTest(InvenioTestCase):
    """Tests for rebuild_reference_lines (joining broken reference lines)."""

    def setUp(self):
        setup_loggers(verbosity=9)

    def test_simple(self):
        marker_pattern = u"^\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])"
        refs = [
            u"[1] hello",
            u"hello2",
            u"[2] foo",
        ]
        rebuilt_refs = rebuild_reference_lines(refs, marker_pattern)
        self.assertEqual(rebuilt_refs, [
            u"[1] hello hello2",
            u"[2] foo",
        ])

    # NOTE(review): disabled test kept as found; the reason it was switched
    # off is not visible in this file.
    # def test_pagination_removal(self):
    #     marker_pattern = ur"^\s*(?P<mark>\[\s*(?P<marknum>\d+)\s*\])"
    #     refs = [
    #         u"[1] hello",
    #         u"hello2",
    #         u"[42]",
    #         u"[2] foo",
    #     ]
    #     rebuilt_refs = rebuild_reference_lines(refs, marker_pattern)
    #     self.assertEqual(rebuilt_refs, [
    #         u"[1] hello hello2",
    #         u"[2] foo",
    #     ])

    def test_pagination_non_removal(self):
        marker_pattern = u"^\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])"
        refs = [
            u"[1] hello",
            u"hello2",
            u"[2]",
            u"foo",
        ]
        rebuilt_refs = rebuild_reference_lines(refs, marker_pattern)
        self.assertEqual(rebuilt_refs, [
            u"[1] hello hello2",
            u"[2] foo",
        ])

    def test_2_lines_together(self):
        # Unlike the other tests, this marker pattern is not anchored at the
        # start of the line, so a marker in the middle of a line counts.
        marker_pattern = u"\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])"
        refs = [
            u"[1] hello",
            u"hello2 [2] foo",
        ]
        rebuilt_refs = rebuild_reference_lines(refs, marker_pattern)
        self.assertEqual(rebuilt_refs, [
            u"[1] hello hello2",
            u"[2] foo",
        ])
class tagArxivTest(InvenioTestCase):
    """Tests for tag_arxiv (tagging of arXiv numbers as report numbers).

    Every test embeds an identifier between a fixed prefix and postfix and
    compares the output of tag_arxiv, stripped of leading/trailing ':' and
    ' ' characters, with the expected line.
    """

    PREFIX = u"{any prefix}"
    POSTFIX = u"{any postfix}"

    def setUp(self):
        setup_loggers(verbosity=1)

    def _assert_tagged(self, identifier, tagged_as):
        """Expect identifier to be replaced by a tag holding tagged_as."""
        ref_line = self.PREFIX + identifier + self.POSTFIX
        expected = self.PREFIX + u"<cds.REPORTNUMBER>" + tagged_as \
                   + u"</cds.REPORTNUMBER>" + self.POSTFIX
        r = tag_arxiv(ref_line)
        self.assertEqual(r.strip(': '), expected)

    def _assert_untouched(self, identifier):
        """Expect the line holding identifier to come back unchanged."""
        ref_line = self.PREFIX + identifier + self.POSTFIX
        r = tag_arxiv(ref_line)
        self.assertEqual(r.strip(': '), ref_line)

    # Identifiers carrying an explicit "arXiv:" prefix

    def test_4_digits(self):
        self._assert_tagged(u"arXiv:1003.1111", u"arXiv:1003.1111")

    def test_4_digits_suffix(self):
        self._assert_tagged(u"arXiv:1104.2222 [physics.ins-det]",
                            u"arXiv:1104.2222 [physics.ins-det]")

    def test_5_digits(self):
        self._assert_tagged(u"arXiv:1303.33333", u"arXiv:1303.33333")

    def test_5_digits_2012(self):
        # A 5-digit number with a 2012 date part is expected to be left alone.
        self._assert_untouched(u"arXiv:1203.33333")

    def test_5_digits_suffix(self):
        self._assert_tagged(u"arXiv:1304.44444 [physics.ins-det]",
                            u"arXiv:1304.44444 [physics.ins-det]")

    # A version marker ("v9") is expected to be dropped from the tag

    def test_4_digits_version(self):
        self._assert_tagged(u"arXiv:1003.1111v9", u"arXiv:1003.1111")

    def test_4_digits_suffix_version(self):
        self._assert_tagged(u"arXiv:1104.2222v9 [physics.ins-det]",
                            u"arXiv:1104.2222 [physics.ins-det]")

    def test_5_digits_version(self):
        self._assert_tagged(u"arXiv:1303.33333v9", u"arXiv:1303.33333")

    def test_5_digits_suffix_version(self):
        self._assert_tagged(u"arXiv:1304.44444v9 [physics.ins-det]",
                            u"arXiv:1304.44444 [physics.ins-det]")

    # Bare identifiers: the "arXiv:" prefix is expected to be added

    def test_4_digits_new(self):
        self._assert_tagged(u"9910.1234", u"arXiv:9910.1234")

    def test_4_digits_suffix_new(self):
        self._assert_tagged(u"9910.1234 [physics.ins-det]",
                            u"arXiv:9910.1234 [physics.ins-det]")

    def test_5_digits_new(self):
        self._assert_tagged(u"1310.12345", u"arXiv:1310.12345")

    def test_5_digits_suffix_new(self):
        self._assert_tagged(u"1310.12345 [physics.ins-det]",
                            u"arXiv:1310.12345 [physics.ins-det]")

    def test_4_digits_version_new(self):
        self._assert_tagged(u"9910.1234v9", u"arXiv:9910.1234")

    def test_4_digits_suffix_version_new(self):
        self._assert_tagged(u"9910.1234v9 [physics.ins-det]",
                            u"arXiv:9910.1234 [physics.ins-det]")

    def test_5_digits_version_new(self):
        self._assert_tagged(u"1310.12345v9", u"arXiv:1310.12345")

    def test_5_digits_suffix_version_new(self):
        self._assert_tagged(u"1310.12345v9 [physics.ins-det]",
                            u"arXiv:1310.12345 [physics.ins-det]")

    def test_5_digits_suffix_version_new_2012(self):
        # A 5-digit number with a 2012 date part is expected to be left alone.
        self._assert_untouched(u"1210.12345v9 [physics.ins-det]")
+
+
# Every test case class of this module must be listed here, otherwise its
# tests are not part of the suite run below.
TEST_SUITE = make_test_suite(ReTest,
                             IbidTest,
                             FindNumerationTest,
                             FindSectionTest,
                             SearchTest,
                             RebuildReferencesTest,
                             tagArxivTest)

if __name__ == '__main__':
    run_test_suite(TEST_SUITE)

Event Timeline