refextract_re.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, Aug 21, 03:43

refextract_re.py
View Options

	# -- coding: utf-8 --
	##
	## This file is part of Invenio.
	## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
	##
	## Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	import re
	from datetime import datetime

	# Sep
	re_sep = ur"\s[,\s:-]\s"
	# Sep or no sep
	re_sep_opt = ur"\s[,\s:-]?\s"

	# Pattern for PoS journal

	# e.g. 2006
	re_pos_year_num = ur'(?:19\|20)\d{2}'
	re_pos_year = ur'(?P<year>(' \
	+ ur'\s' + re_pos_year_num + ur'\s' \
	+ ur'\|' \
	+ ur'$' + re_pos_year_num + '$' \
	+ ur'))'
	# e.g. LAT2007
	re_pos_volume = ur'(?P<volume_name>\w{1,10})' + re_sep_opt + ur'(?P<volume_num>(?:19\|20)\d{2})'
	# e.g. (LAT2007)
	re_pos_volume_par = ur'$' + re_pos_volume + ur'$'
	# e.g. 20
	re_pos_page = ur'(?P<page>\d{1,4})'
	re_pos_title = ur'POS'

	re_pos_patterns = [
	re_pos_title + re_sep_opt + re_pos_year + re_sep + re_pos_volume + re_sep + re_pos_page,
	re_pos_title + re_sep + re_pos_volume + re_sep_opt + re_pos_year + re_sep_opt + re_pos_page,
	re_pos_title + re_sep + re_pos_volume + re_sep + re_pos_page + re_sep_opt + re_pos_year,
	re_pos_title + re_sep_opt + re_pos_volume_par + re_sep_opt + re_pos_page,
	]
	re_opts = re.VERBOSE \| re.UNICODE \| re.IGNORECASE


	def compute_pos_patterns(patterns):
	return [re.compile(p, re_opts) for p in patterns]
	re_pos = compute_pos_patterns(re_pos_patterns)

	# Pattern for arxiv numbers
	# arxiv 9910-1234v9 [physics.ins-det]
	re_arxiv = re.compile(ur"""
	ARXIV[\s:-]*(?P<year>\d{2})-?(?P<month>\d{2})
	[\s.-](?P<num>\d{4})(?:[\s-]V(?P<version>\d))?
	\s*(?P<suffix>\[[A-Z.-]+\])? """, re.VERBOSE \| re.UNICODE \| re.IGNORECASE)

	# Pattern for arxiv numbers catchup
	# arxiv:9910-123 [physics.ins-det]
	RE_ARXIV_CATCHUP = re.compile(ur"""
	ARXIV[\s:-]*(?P<year>\d{2})-?(?P<month>\d{2})
	[\s.-]*(?P<num>\d{3})
	\s*\[(?P<suffix>[A-Z.-]+)\]""", re.VERBOSE \| re.UNICODE \| re.IGNORECASE)

	# Patterns for ATLAS CONF report numbers
	RE_ATLAS_CONF_PRE_2010 = re.compile(
	ur'(?<!\w:)ATL(AS)?-CONF-(?P<code>(?:200\d\|99)-\d{3})(?![\w\d])')
	RE_ATLAS_CONF_POST_2010 = re.compile(
	ur'(?<!\w:)ATL(AS)?-CONF-(?P<code>20[1-9]\d-\d{3})(?![\w\d])')


	# Pattern for old arxiv numbers
	old_arxiv_numbers = ur"[\\|/:\s-]?(?P<num>(?:9[1-9]\|0[0-7])(?:0[1-9]\|1[0-2])\d{3})(?:v\d{1,3})?(?=[^\w\d]\|$)"

	old_arxiv = {
	ur"acc-ph": None,
	ur"astro-ph": None,
	ur"astro-phy": "astro-ph",
	ur"astro-ph\.[a-z]{2}": None,
	ur"atom-ph": None,
	ur"chao-dyn": None,
	ur"chem-ph": None,
	ur"cond-mat": None,
	ur"cs": None,
	ur"cs\.[a-z]{2}": None,
	ur"gr-qc": None,
	ur"hep-ex": None,
	ur"hep-lat": None,
	ur"hep-ph": None,
	ur"hepph": "hep-ph",
	ur"hep-th": None,
	ur"hepth": "hep-th",
	ur"math": None,
	ur"math\.[a-z]{2}": None,
	ur"math-ph": None,
	ur"nlin": None,
	ur"nlin\.[a-z]{2}": None,
	ur"nucl-ex": None,
	ur"nucl-th": None,
	ur"physics": None,
	ur"physics\.acc-ph": None,
	ur"physics\.ao-ph": None,
	ur"physics\.atm-clus": None,
	ur"physics\.atom-ph": None,
	ur"physics\.bio-ph": None,
	ur"physics\.chem-ph": None,
	ur"physics\.class-ph": None,
	ur"physics\.comp-ph": None,
	ur"physics\.data-an": None,
	ur"physics\.ed-ph": None,
	ur"physics\.flu-dyn": None,
	ur"physics\.gen-ph": None,
	ur"physics\.geo-ph": None,
	ur"physics\.hist-ph": None,
	ur"physics\.ins-det": None,
	ur"physics\.med-ph": None,
	ur"physics\.optics": None,
	ur"physics\.plasm-ph": None,
	ur"physics\.pop-ph": None,
	ur"physics\.soc-ph": None,
	ur"physics\.space-ph": None,
	ur"plasm-ph": "physics.plasm-ph",
	ur"q-bio\.[a-z]{2}": None,
	ur"q-fin\.[a-z]{2}": None,
	ur"q-alg": None,
	ur"quant-ph": None,
	ur"quant-phys": "quant-ph",
	ur"solv-int": None,
	ur"stat\.[a-z]{2}": None,
	ur"stat-mech": None,
	ur"dg-ga": None,
	ur"hap-ph": "hep-ph",
	ur"funct-an": None,
	ur"quantph": "quant-ph",
	ur"stro-ph": "astro-ph",
	ur"hepex": "hep-ex",
	ur"math-ag": "math.ag",
	ur"math-dg": "math.dg",
	ur"nuc-th": "nucl-th",
	ur"math-ca": "math.ca",
	ur"nlin-si": "nlin.si",
	ur"quantum-ph": "quant-ph",
	ur"ep-ph": "hep-ph",
	ur"ep-th": "hep-ph",
	ur"ep-ex": "hep-ex",
	ur"hept-h": "hep-th",
	ur"hepp-h": "hep-ph",
	ur"physi-cs": "physics",
	ur"asstro-ph": "astro-ph",
	ur"hep-lt": "hep-lat",
	ur"he-ph": "hep-ph",
	ur"het-ph": "hep-ph",
	ur"mat-ph": "math.th",
	ur"math-th": "math.th",
	ur"ucl-th": "nucl-th",
	ur"nnucl-th": "nucl-th",
	ur"nuclt-th": "nucl-th",
	ur"atro-ph": "astro-ph",
	ur"qnant-ph": "quant-ph",
	ur"astr-ph": "astro-ph",
	ur"math-qa": "math.qa",
	ur"tro-ph": "astro-ph",
	ur"hucl-th": "nucl-th",
	ur"math-gt": "math.gt",
	ur"math-nt": "math.nt",
	ur"math-ct": "math.ct",
	ur"math-oa": "math.oa",
	ur"math-sg": "math.sg",
	ur"math-ap": "math.ap",
	ur"quan-ph": "quant-ph",
	ur"nlin-cd": "nlin.cd",
	ur"math-sp": "math.sp",
	ur"atro-ph": "astro-ph",
	ur"ast-ph": "astro-ph",
	ur"asyro-ph": "astro-ph",
	ur"aastro-ph": "astro-ph",
	ur"astrop-ph": "astro-ph",
	ur"arxivastrop-ph": "astro-ph",
	ur"hept-th": "hep-th",
	ur"quan-th": "quant-th",
	ur"asro-ph": "astro-ph",
	ur"castro-ph": "astro-ph",
	ur"asaastro-ph": "astro-ph",
	ur"hhep-ph": "hep-ph",
	ur"hhep-ex": "hep-ex",
	ur"alg-geom": None,
	ur"nuclth": "nucl-th",
	}


	def compute_arxiv_re(report_pattern, report_number):
	if report_number is None:
	report_number = ur"\g<name>"
	report_re = re.compile(ur"(?<!<cds\.REPORTNUMBER>)(?<!\w)" \
	+ "(?P<name>" + report_pattern + ")" \
	+ old_arxiv_numbers, re.U\|re.I)
	return report_re, report_number

	RE_OLD_ARXIV = [compute_arxiv_re(*i) for i in old_arxiv.iteritems()]


	def compute_years():
	current_year = datetime.now().year
	return '\|'.join(str(y)[2:] for y in xrange(1991, current_year + 1))
	arxiv_years = compute_years()


	def compute_months():
	return '\|'.join(str(y).zfill(2) for y in xrange(1, 13))
	arxiv_months = compute_months()

	re_new_arxiv = re.compile(ur""" # 9910.1234v9 [physics.ins-det]
	(?<!ARXIV:)
	(?P<year>%(arxiv_years)s)
	(?P<month>%(arxiv_months)s)
	\.(?P<num>\d{4})(?:[\s-]*V(?P<version>\d))?
	\s*(?P<suffix>\[[A-Z.-]+\])? """ % {'arxiv_years': arxiv_years,
	'arxiv_months': arxiv_months}, re.VERBOSE \| re.UNICODE \| re.IGNORECASE)

	# Pattern to recognize quoted text:
	re_quoted = re.compile(ur'"(?P<title>[^"]+)"', re.UNICODE)

	# Pattern to recognise an ISBN for a book:
	re_isbn = re.compile(ur"""
	(?:ISBN[-– ]*(?:\|10\|13)\|International Standard Book Number)
	[:\s]*
	(?P<code>[-\-–0-9Xx]{10,25})""", re.VERBOSE \| re.UNICODE)

	# Pattern to recognise a correct knowledge base line:
	re_kb_line = re.compile(ur'^\s(?P<seek>[^\s].)\s---\s(?P<repl>[^\s].)\s$',
	re.UNICODE)

	# precompile some often-used regexp for speed reasons:
	re_regexp_character_class = re.compile(ur'\[[^\]]+\]', re.UNICODE)
	re_multiple_hyphens = re.compile(ur'-{2,}', re.UNICODE)


	# In certain papers, " bf " appears just before the volume of a
	# cited item. It is believed that this is a mistyped TeX command for
	# making the volume "bold" in the paper.
	# The line may look something like this after numeration has been recognised:
	# M. Bauer, B. Stech, M. Wirbel, Z. Phys. bf C : <cds.VOL>34</cds.VOL>
	# <cds.YR>(1987)</cds.YR> <cds.PG>103</cds.PG>
	# The " bf " stops the title from being correctly linked with its series
	# and/or numeration and thus breaks the citation.
	# The pattern below is used to identify this situation and remove the
	# " bf" component:
	re_identify_bf_before_vol = \
	re.compile(ur' bf ((\w )?: \<cds\.VOL\>)', \
	re.UNICODE)

	# Patterns used for creating institutional preprint report-number
	# recognition patterns (used by function "institute_num_pattern_to_regex"):
	# Recognise any character that isn't a->z, A->Z, 0->9, /, [, ], ' ', '"':
	re_report_num_chars_to_escape = \
	re.compile(ur'([^\]A-Za-z0-9\/\[ "])', re.UNICODE)
	# Replace "hello" with hello:
	re_extract_quoted_text = (re.compile(ur'\"([^"]+)\"', re.UNICODE), ur'\g<1>',)
	# Replace / [abcd ]/ with /( [abcd])?/ :
	re_extract_char_class = (re.compile(ur' \[([^\]]+) \]', re.UNICODE), \
	ur'( [\g<1>])?')


	# URL recognition:
	raw_url_pattern = ur"""
	(https?\|s?ftp)://(?:[\w\d_.-])+(?::\d{1,5})?
	(?:/[\w\d_.?=&%~∼-]+)*/?
	"""
	# Stand-alone URL (e.g. http://invenio-software.org/ )
	re_raw_url = \
	re.compile("['\"]?(?P<url>" + raw_url_pattern + ")['\"]?",
	re.UNICODE\|re.I\|re.VERBOSE)

	# HTML marked-up URL (e.g. <a href="http://invenio-software.org/">
	# CERN Document Server Software Consortium</a> )
	re_html_tagged_url = \
	re.compile(ur"""
	# Opening a tag
	<a\s+
	# href attribute
	href\s=\s[\'"]
	# href value
	(?P<url>""" + raw_url_pattern + ur""")
	# href closing quote
	['"]\s*>
	# Tag content
	(?P<desc>[^\<]+)
	# Closing a tag
	</a>""", re.UNICODE\|re.I\|re.VERBOSE)


	# Numeration recognition pattern - used to identify numeration
	# associated with a title when marking the title up into MARC XML:
	vol_tag = ur'<cds\.VOL\>(?P<vol>[^<]+)<\/cds\.VOL>'
	year_tag = ur'\<cds\.YR\>$(?P<yr>[^<]+)$\<\/cds\.YR\>'
	series_tag = ur'(?P<series>(?:[A-H]\|I{1,3}V?\|VI{0,3}))?'
	page_tag = ur'\<cds\.PG\>(?P<pg>[^<]+)\<\/cds\.PG\>'
	re_recognised_numeration_for_title_plus_series = re.compile(
	ur'^\s[\.,]?\s(?:Ser\.\s)?' + series_tag + ur'\s:?\s*' + vol_tag +
	u'\s(?: ' + year_tag + u')?\s(?: ' + page_tag + u')', re.UNICODE)

	# Another numeration pattern. This one is designed to match marked-up
	# numeration that is essentially an IBID, but without the word "IBID". E.g.:
	# <cds.JOURNAL>J. Phys. A</cds.JOURNAL> : <cds.VOL>31</cds.VOL>
	# <cds.YR>(1998)</cds.YR> <cds.PG>2391</cds.PG>; : <cds.VOL>32</cds.VOL>
	# <cds.YR>(1999)</cds.YR> <cds.PG>6119</cds.PG>.
	re_numeration_no_ibid_txt = \
	re.compile(ur"""
	^((\s;\s\|\s+and\s+)(?P<series>(?:[A-H]\|I{1,3}V?\|VI{0,3}))?\s*:?\s ## Leading ; : or " and :", and a possible series letter
	\<cds\.VOL\>(?P<vol>\d+\|(?:\d+\-\d+))\<\/cds\.VOL>\s ## Volume
	\<cds\.YR\>$(?P<yr>[12]\d{3})$\<\/cds\.YR\>\s ## year
	\<cds\.PG\>(?P<pg>[RL]?\d+[c]?)\<\/cds\.PG\>) ## page
	""", re.UNICODE\|re.VERBOSE)

	re_title_followed_by_series_markup_tags = \
	re.compile(ur'(\<cds.JOURNAL(?P<ibid>ibid)?\>([^\<]+)\<\/cds.JOURNAL(?:ibid)?\>\s.?\s\<cds\.SER\>([A-H]\|(I{1,3}V?\|VI{0,3}))\<\/cds\.SER\>)', re.UNICODE)

	re_title_followed_by_implied_series = \
	re.compile(ur'(\<cds.JOURNAL(?P<ibid>ibid)?\>([^\<]+)\<\/cds.JOURNAL(?:ibid)?\>\s.?\s([A-H]\|(I{1,3}V?\|VI{0,3}))\s+:)', re.UNICODE)


	re_punctuation = re.compile(ur'[\.\,\;\'\-]', re.UNICODE)

	# The following pattern is used to recognise "citation items" that have been
	# identified in the line, when building a MARC XML representation of the line:
	re_tagged_citation = re.compile(ur"""
	\<cds\. ## open tag: <cds.
	((?:JOURNAL(?P<ibid>ibid)?) ## a JOURNAL tag
	\|VOL ## or a VOL tag
	\|YR ## or a YR tag
	\|PG ## or a PG tag
	\|REPORTNUMBER ## or a REPORTNUMBER tag
	\|SER ## or a SER tag
	\|URL ## or a URL tag
	\|DOI ## or a DOI tag
	\|QUOTED ## or a QUOTED tag
	\|ISBN ## or a ISBN tag
	\|PUBLISHER ## or a PUBLISHER tag
	\|COLLABORATION ## or a COLLABORATION tag
	\|AUTH(stnd\|etal\|incl)) ## or an AUTH tag
	(\s\/)? ## optional /
	\> ## closing of tag (>)
	""", re.UNICODE\|re.VERBOSE)


	# is there pre-recognised numeration-tagging within a
	# few characters of the start if this part of the line?
	re_tagged_numeration_near_line_start = \
	re.compile(ur'^.{0,4}?<CDS (VOL\|SER)>', re.UNICODE)

	re_ibid = re.compile(ur'(-\|\b)?IBID(EM)?\.?', re.UNICODE)

	re_series_from_numeration = re.compile(ur'^([A-Z])\s[,\s:-]?\s\d+', re.UNICODE)
	re_series_from_numeration_after_volume = re.compile(ur'^\d+\s[,\s:-]?\s([A-Z])', re.UNICODE)

	# Obtain the series character from the standardised title text
	# Only used when no series letter is obtained from numeration matching
	re_series_from_title = re.compile(ur"""
	([^\s].*)
	(?:[\s\.]+(?:(?P<open_bracket>\()\s*[Ss][Ee][Rr]\.)?
	([A-H]\|(I{1,3}V?\|VI{0,3}))
	)?
	(?(open_bracket)\s*\))$ ## Only match the ending bracket if the opening bracket was found""", \
	re.UNICODE\|re.VERBOSE)


	re_wash_volume_tag = (
	re.compile(ur'<cds\.VOL>(\w) (\d+)</cds\.VOL>'),
	ur'<cds.VOL>\g<1>\g<2></cds.VOL>',
	)

	# Roman Numbers
	re_roman_numbers = ur"[XxVvIi]+"

	# Possible beginnings of numeration
	re_start = ur"\s[,\s:-]?\s"

	# Title tag
	re_title_tag = ur"(?P<title_tag><cds\.JOURNAL>[^<]*<\/cds\.JOURNAL>)"

	# Number (within a volume)
	re_volume_sub_number = ur'[Nn][oO°]\.?\s*\d{1,6}'
	re_volume_sub_number_opt = u'(?:' + re_sep + u'(?P<vol_sub>' + \
	re_volume_sub_number + u'))?'

	# Volume
	re_volume_prefix = ur"(?:[Vv]o?l?\.?\|[Nn][oO°]\.?)" # Optional Vol./No.
	re_volume_suffix = ur"(?:\s*$\d{1,2}(?:-\d)?$)?"
	re_volume_num = ur"\d+\|" + "(?:(?<!\w)" + re_roman_numbers + "(?!\w))"
	re_volume_id = ur"(?P<vol>(?:(?:[A-Za-z]\s[,\s:-]?\s)?(?P<vol_num>%(volume_num)s))\|(?:(?P<vol_num_alt>%(volume_num)s)(?:[A-Za-z]))\|(?:(?:[A-Za-z]\s?)?(?P<vol_num_alt2>\d+)\s\-\s(?:[A-Za-z]\s?)?\d+))" % {'volume_num': re_volume_num}
	re_volume_check = ur"(?<![\/\d])"
	re_volume = ur"\b" + u"(?:" + re_volume_prefix + u")?\s*" + re_volume_check + \
	re_volume_id + re_volume_suffix

	# Month
	re_short_month = ur"""(?:(?:
	[Jj]an\|[Ff]eb\|[Mm]ar\|[Aa]pr\|[Mm]ay\|[Jj]un\|
	[Jj]ul\|[Aa]ug\|[Ss]ep\|[Oo]ct\|[Nn]ov\|[Dd]ec
	)\.?)"""

	re_month = ur"""(?:(?:
	[Jj]anuary\|[Ff]ebruary\|[Mm]arch\|[Aa]pril\|[Mm]ay\|[Jj]une\|
	[Jj]uly\|[Aa]ugust\|[Ss]eptember\|[Oo]ctober\|[Nn]ovember\|[Dd]ecember
	)\.?)"""

	# Year
	re_year_num = ur"(?:19\|20)\d{2}"
	re_year_text = u"(?P<year>[A-Za-z]?" + re_year_num + u")(?:[A-Za-z]?)"
	re_year = ur"""
	\(?
	(?:%(short_month)s[,\s]\s*)? # Jul, 1980
	(?:%(month)s[,\s]\s*)? # July, 1980
	(?<!\d)
	%(year)s
	(?!\d)
	\)?
	""" % {
	'year': re_year_text,
	'short_month': re_short_month,
	'month': re_month,
	}

	# Page
	re_page_prefix = ur"[pP]?[p]?\.?\s?" # Starting page num: optional Pp.
	re_page_num = ur"[RL]?\w?\d+[cC]?" # pagenum with optional R/L
	re_page_sep = ur"\s-\s" # optional separator between pagenums
	re_page = re_page_prefix + \
	u"(?P<page>" + re_page_num + u")(?:" + re_page_sep + \
	u"(?P<page_end>" + re_page_num + u"))?"

	# Series
	re_series = ur"(?P<series>[A-H])"

	# Used for allowing 3(1991) without space
	re_look_ahead_parentesis = ur"(?=\()"
	re_sep_or_parentesis = u'(?:' + re_sep + u'\|' + re_look_ahead_parentesis + ')'

	re_look_behind_parentesis = ur"(?<=\))"
	re_sep_or_after_parentesis = u'(?:' + \
	re_sep + u'\|' + re_look_behind_parentesis + ')'


	# After having processed a line for titles, it may be possible to find more
	# numeration with the aid of the recognised titles. The following 2 patterns
	# are used for this:

	re_correct_numeration_2nd_try_ptn1 = re.compile(
	re_year + re_sep + # Year
	re_title_tag + # Recognised, tagged title
	u'(?P<aftertitle>' +
	re_sep +
	re_volume + re_sep + # The volume
	re_page + # The page
	u')', re.UNICODE\|re.VERBOSE)

	re_correct_numeration_2nd_try_ptn2 = re.compile(
	re_year + re_sep +
	re_title_tag +
	u'(?P<aftertitle>' +
	re_sep +
	re_volume + re_sep +
	re_series + re_sep +
	re_page +
	u')', re.UNICODE\|re.VERBOSE)

	re_correct_numeration_2nd_try_ptn3 = re.compile(
	re_title_tag +
	u'(?P<aftertitle>' +
	re_sep + # Recognised, tagged title
	re_volume + re_sep + # The volume
	re_page + # The page
	u')', re.UNICODE\|re.VERBOSE)


	re_correct_numeration_2nd_try_ptn4 = re.compile(
	re_title_tag +
	u'(?P<aftertitle>' +
	re_sep + # Recognised, tagged title
	re_year + ur"\s[.,\s:]\s" + # Year
	re_volume + re_sep + # The volume
	re_page + # The page
	u')', re.UNICODE\|re.VERBOSE)


	## precompile some regexps used to search for and standardize
	## numeration patterns in a line for the first time:

	## Delete the colon and expressions such as Serie, vol, V. inside the pattern
	## <serie : volume> E.g. Replace the string """Series A, Vol 4""" with """A 4"""
	re_strip_series_and_volume_labels = (re.compile(
	ur'(Serie\s\|\bS\.?\s)?([A-H])\s?[:,]\s?(\b[Vv]o?l?\.?\|\b[Nn]o\.?)?\s?(\d+)', re.UNICODE),
	ur'\g<2> \g<4>')


	## This pattern is not compiled, but rather included in
	## the other numeration paterns:
	re_nucphysb_subtitle = \
	ur'(?:[$\[]\s(?:[Ff][Ss]\|[Pp][Mm])\s\d{0,4}\s*[$\]])'
	re_nucphysb_subtitle_opt = \
	u'(?:' + re_sep + re_nucphysb_subtitle + u')?'


	## the 4 main numeration patterns:

	## Pattern 1: <vol, page, year>

	## <v, p, y>
	re_numeration_vol_page_yr = re.compile(
	re_start +
	re_volume + re_volume_sub_number_opt + re_sep +
	re_page + re_sep_or_parentesis +
	re_year, re.UNICODE\|re.VERBOSE)

	## <v, [FS], p, y>
	re_numeration_vol_nucphys_page_yr = re.compile(
	re_start +
	re_volume + re_volume_sub_number_opt + re_sep +
	re_nucphysb_subtitle + re_sep +
	re_page + re_sep_or_parentesis +
	re_year, re.UNICODE\|re.VERBOSE)

	## <[FS], v, p, y>
	re_numeration_nucphys_vol_page_yr = re.compile(
	re_start +
	re_nucphysb_subtitle + re_sep +
	re_volume + re_sep +
	re_page + re_sep_or_parentesis +
	re_year, re.UNICODE\|re.VERBOSE)

	## Pattern 2: <vol, year, page>

	## <v, y, p>
	re_numeration_vol_yr_page = re.compile(
	re_start +
	re_volume + re_sep_or_parentesis +
	re_year + re_sep_or_after_parentesis +
	re_page, re.UNICODE\|re.VERBOSE)

	## <v, sv, [FS]?, y, p>
	re_numeration_vol_subvol_nucphys_yr_page = re.compile(
	re_start +
	re_volume + re_volume_sub_number_opt +
	re_nucphysb_subtitle_opt + re_sep_or_parentesis +
	re_year + re_sep_or_after_parentesis +
	re_page, re.UNICODE\|re.VERBOSE)

	## <v, [FS]?, y, sv, p>
	re_numeration_vol_nucphys_yr_subvol_page = re.compile(
	re_start +
	re_volume + re_nucphysb_subtitle_opt +
	re_sep_or_parentesis +
	re_year + re_volume_sub_number_opt + re_sep +
	re_page, re.UNICODE\|re.VERBOSE)

	## <[FS]?, v, y, p>
	re_numeration_nucphys_vol_yr_page = re.compile(
	re_start +
	re_nucphysb_subtitle + re_sep +
	re_volume + re_sep_or_parentesis + # The volume (optional "vol"/"no")
	re_year + re_sep_or_after_parentesis + # Year
	re_page, re.UNICODE\|re.VERBOSE)

	## Pattern 3: <vol, serie, year, page>

	## <v, s, [FS]?, y, p>
	# re_numeration_vol_series_nucphys_yr_page = (re.compile(
	# re_volume + re_sep +
	# re_series + re_sep +
	# _sre_non_compiled_pattern_nucphysb_subtitle + re_sep_or_parentesis +
	# re_year + re_sep +
	# re_page, re.UNICODE\|re.VERBOSE), ur' \g<series> : ' \
	# ur'<cds.VOL>\g<vol></cds.VOL> ' \
	# ur'<cds.YR>(\g<year>)</cds.YR> ' \
	# ur'<cds.PG>\g<page></cds.PG> ')

	## <v, [FS]?, s, y, p
	re_numeration_vol_nucphys_series_yr_page = re.compile(
	re_start +
	re_volume + re_nucphysb_subtitle_opt + re_sep +
	re_series + re_sep_or_parentesis +
	re_year + re_sep_or_after_parentesis +
	re_page, re.UNICODE\|re.VERBOSE)

	## Pattern 4: <vol, serie, page, year>
	## <v, s, [FS]?, p, y>
	re_numeration_vol_series_nucphys_page_yr = re.compile(
	re_start +
	re_volume + re_sep +
	re_series + re_nucphysb_subtitle_opt + re_sep +
	re_page + re_sep +
	re_year, re.UNICODE\|re.VERBOSE)

	## <v, [FS]?, s, p, y>
	re_numeration_vol_nucphys_series_page_yr = re.compile(
	re_start +
	re_volume + re_nucphysb_subtitle_opt + re_sep +
	re_series + re_sep +
	re_page + re_sep +
	re_year, re.UNICODE\|re.VERBOSE)

	## Pattern 5: <year, vol, page>
	re_numeration_yr_vol_page = re.compile(
	re_start +
	re_year + re_sep_or_after_parentesis +
	re_volume + re_sep +
	re_page, re.UNICODE\|re.VERBOSE)


	## Pattern used to locate references of a doi inside a citation
	## This pattern matches both url (http) and 'doi:' or 'DOI' formats
	re_doi = (re.compile(ur"""
	(($?[Dd][Oo][Ii](\s)$?:?(\s)) # 'doi:' or 'doi' or '(doi)' (upper or lower case)
	\|(https?://dx\.doi\.org\/))? # or 'http://dx.doi.org/' (neither has to be present)
	(10\. # 10. (mandatory for DOI's)
	\d{4} # [0-9] x4
	/ # /
	[\w\-_:;/\.<>]+ # any character
	[\w\-_:;/<>]) # any character excluding a full stop
	""", re.VERBOSE))


	def _create_regex_pattern_add_optional_spaces_to_word_characters(word):
	"""Add the regex special characters (\s*) to allow optional spaces between
	the characters in a word.
	@param word: (string) the word to be inserted into a regex pattern.
	@return: string: the regex pattern for that word with optional spaces
	between all of its characters.
	"""
	new_word = u""
	for ch in word:
	if ch.isspace():
	new_word += ch
	else:
	new_word += ch + ur'\s*'
	return new_word


	def get_reference_section_title_patterns():
	"""Return a list of compiled regex patterns used to search for the title of
	a reference section in a full-text document.
	@return: (list) of compiled regex patterns.
	"""
	patterns = []
	titles = [u'references',
	u'references.',
	u'r\u00C9f\u00E9rences',
	u'r\u00C9f\u00C9rences',
	u'reference',
	u'refs',
	u'r\u00E9f\u00E9rence',
	u'r\u00C9f\u00C9rence',
	u'r\xb4ef\xb4erences',
	u'r\u00E9fs',
	u'r\u00C9fs',
	u'bibliography',
	u'bibliographie',
	u'citations',
	u'literaturverzeichnis']
	sect_marker = u'^\s([\[\-\{\(])?\s' \
	u'((\w\|\d){1,5}([\.\-\,](\w\|\d){1,5})?\s*' \
	u'[\.\-\}\)\]]\s*)?' \
	u'(?P<title>'
	sect_marker1 = u'^(\d){1,3}\s*(?P<title>'
	line_end = ur'(\ss\se\sc\st\si\so\sn\s)?)([\)\}\]])?' \
	ur'($\|\s[\[\{$\<]\s[1a-z]\s*[\}$\>\]]\|\:$)'

	for t in titles:
	t_ptn = re.compile(sect_marker + \
	_create_regex_pattern_add_optional_spaces_to_word_characters(t) + \
	line_end, re.I\|re.UNICODE)
	patterns.append(t_ptn)
	## allow e.g. 'N References' to be found where N is an integer
	t_ptn = re.compile(sect_marker1 + \
	_create_regex_pattern_add_optional_spaces_to_word_characters(t) + \
	line_end, re.I\|re.UNICODE)
	patterns.append(t_ptn)

	return patterns


	def get_reference_line_numeration_marker_patterns(prefix=u''):
	"""Return a list of compiled regex patterns used to search for the marker
	of a reference line in a full-text document.
	@param prefix: (string) the possible prefix to a reference line
	@return: (list) of compiled regex patterns.
	"""
	title = u""
	if type(prefix) in (str, unicode):
	title = prefix
	g_name = u'(?P<mark>'
	g_close = u')'
	space = ur'\s*'
	patterns = [
	# [1]
	space + title + g_name + ur'\[\s(?P<marknum>\d+)\s\]' + g_close,
	# [<letters and numbers]
	space + title + g_name + ur'\[\s[a-zA-Z:-]+\+?\s?(\d{1,4}[A-Za-z:-]?)?\s\]' + g_close,
	# {1}
	space + title + g_name + ur'\{\s(?P<marknum>\d+)\s\}' + g_close,
	# (1)
	space + title + g_name + ur'\<\s(?P<marknum>\d+)\s\>' + g_close,
	space + title + g_name + ur'$\s(?P<marknum>\d+)\s$' + g_close,
	space + title + g_name + ur'(?P<marknum>\d+)\s*\.(?!\d)' + g_close,
	space + title + g_name + ur'(?P<marknum>\d+)\s+' + g_close,
	space + title + g_name + ur'(?P<marknum>\d+)\s*\]' + g_close,
	# 1]
	space + title + g_name + ur'(?P<marknum>\d+)\s*\}' + g_close,
	# 1}
	space + title + g_name + ur'(?P<marknum>\d+)\s*\)' + g_close,
	# 1)
	space + title + g_name + ur'(?P<marknum>\d+)\s*\>' + g_close,
	# [1.1]
	space + title + g_name + ur'\[\s\d+\.\d+\s\]' + g_close,
	# [ ]
	space + title + g_name + ur'\[\s*\]' + g_close,
	# *
	space + title + g_name + ur'\*' + g_close,
	]
	return [re.compile(p, re.I\|re.UNICODE) for p in patterns]


	def get_reference_line_marker_pattern(pattern):
	"""Return a list of compiled regex patterns used to search for the first
	reference line in a full-text document.
	The line is considered to start with either: [1] or {1}
	The line is considered to start with : 1. or 2. or 3. etc
	The line is considered to start with : 1 or 2 etc (just a number)
	@return: (list) of compiled regex patterns.
	"""
	return re.compile(u'(?P<mark>' + pattern + u')', re.I\|re.UNICODE)

	re_reference_line_bracket_markers = get_reference_line_marker_pattern(
	ur'(?P<left>\[)\s(?P<marknum>\d+)\s(?P<right>\])'
	)
	re_reference_line_curly_bracket_markers = get_reference_line_marker_pattern(
	ur'(?P<left>\{)\s(?P<marknum>\d+)\s(?P<right>\})'
	)
	re_reference_line_dot_markers = get_reference_line_marker_pattern(
	ur'(?P<left>)\s(?P<marknum>\d+)\s(?P<right>\.)'
	)
	re_reference_line_number_markers = get_reference_line_marker_pattern(
	ur'(?P<left>)\s(?P<marknum>\d+)\s(?P<right>)'
	)


	def get_post_reference_section_title_patterns():
	"""Return a list of compiled regex patterns used to search for the title
	of the section after the reference section in a full-text document.
	@return: (list) of compiled regex patterns.
	"""
	compiled_patterns = []
	thead = ur'^\s([\{$\<\[]?\s(\w\|\d)\s[$\}\>\.\-\]]?\s)?'
	ttail = ur'(\s\:\s)?'
	numatn = ur'(\d+\|\w\b\|i{1,3}v?\|vi{0,3})[\.\,]{0,2}\b'
	roman_numbers = ur'[LVIX]'
	patterns = [
	# Section titles
	thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'appendix') + ttail,
	thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'appendices') + ttail,
	thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'acknowledgement') + ur's?' + ttail,
	thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'acknowledgment') + ur's?' + ttail,
	thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'table') + ur'\w?s?\d?' + ttail,
	thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'figure') + ur's?' + ttail,
	thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'list of figure') + ur's?' + ttail,
	thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'annex') + ur's?' + ttail,
	thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'discussion') + ur's?' + ttail,
	thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'remercie') + ur's?' + ttail,
	thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'index') + ur's?' + ttail,
	thead + _create_regex_pattern_add_optional_spaces_to_word_characters(u'summary') + ur's?' + ttail,
	# Figure nums
	ur'^\s*' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'figure') + numatn,
	ur'^\s' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'fig') + ur'\.\s' + numatn,
	ur'^\s' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'fig') + ur'\.?\s\d\w?\b',
	# Tables
	ur'^\s*' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'table') + numatn,
	ur'^\s' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'tab') + ur'\.\s' + numatn,
	ur'^\s' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'tab') + ur'\.?\s\d\w?\b',
	# Other titles formats
	ur'^\s' + roman_numbers + ur'\.?\s[Cc]onclusion[\w\s]*$',
	ur'^\sAppendix\s[A-Z]\s\:\s[a-zA-Z]+\s',
	]

	for p in patterns:
	compiled_patterns.append(re.compile(p, re.I\|re.UNICODE))

	return compiled_patterns


	def get_post_reference_section_keyword_patterns():
	"""Return a list of compiled regex patterns used to search for various
	keywords that can often be found after, and therefore suggest the end of,
	a reference section in a full-text document.
	@return: (list) of compiled regex patterns.
	"""
	compiled_patterns = []
	patterns = [u'(' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'prepared') + \
	ur'\|' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'created') + \
	ur').(AAS\s)?\sLATEX',
	ur'AAS\s+?LATEX\s+?' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'macros') + u'v',
	ur'^\s*' + _create_regex_pattern_add_optional_spaces_to_word_characters(u'This paper has been produced using'),
	ur'^\s*' + \
	_create_regex_pattern_add_optional_spaces_to_word_characters(u'This article was processed by the author using Springer-Verlag') + \
	u' LATEX']
	for p in patterns:
	compiled_patterns.append(re.compile(p, re.I\|re.UNICODE))
	return compiled_patterns


	def regex_match_list(line, patterns):
	"""Given a list of COMPILED regex patters, perform the "re.match" operation
	on the line for every pattern.
	Break from searching at the first match, returning the match object.
	In the case that no patterns match, the None type will be returned.
	@param line: (unicode string) to be searched in.
	@param patterns: (list) of compiled regex patterns to search "line"
	with.
	@return: (None or an re.match object), depending upon whether one of
	the patterns matched within line or not.
	"""
	m = None
	for ptn in patterns:
	m = ptn.match(line)
	if m is not None:
	break
	return m

	# The different forms of arXiv notation
	re_arxiv_notation = re.compile(ur"""
	(arxiv)\|(e[\-\s]?print:?\s*arxiv)
	""", re.VERBOSE)

	# et. al. before J. /// means J is a journal

	re_num = re.compile(ur'(\d+)')

refextract_re.pyNo OneTemporaryActions

File Metadata

refextract_re.pyView Options

Event Timeline

refextract_re.py
No OneTemporary
Actions

refextract_re.py
View Options