refextract_xml.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sun, Jul 6, 09:02

refextract_xml.py
View Options

	# -*- coding: utf-8 -*-
	##
	## This file is part of Invenio.
	## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
	##
	## Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	import re

	from xml.sax.saxutils import escape as encode_for_xml
	from datetime import datetime

	from invenio.refextract_re import re_num
	from invenio.docextract_utils import write_message
	from invenio.refextract_config import \
	CFG_REFEXTRACT_TAG_ID_REFERENCE, \
	CFG_REFEXTRACT_IND1_REFERENCE, \
	CFG_REFEXTRACT_IND2_REFERENCE, \
	CFG_REFEXTRACT_SUBFIELD_MARKER, \
	CFG_REFEXTRACT_SUBFIELD_AUTH, \
	CFG_REFEXTRACT_SUBFIELD_TITLE, \
	CFG_REFEXTRACT_SUBFIELD_MISC, \
	CGF_REFEXTRACT_SEMI_COLON_MISC_TEXT_SENSITIVITY, \
	CFG_REFEXTRACT_SUBFIELD_REPORT_NUM, \
	CFG_REFEXTRACT_XML_RECORD_OPEN, \
	CFG_REFEXTRACT_CTRL_FIELD_RECID, \
	CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS, \
	CFG_REFEXTRACT_IND1_EXTRACTION_STATS, \
	CFG_REFEXTRACT_IND2_EXTRACTION_STATS, \
	CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS, \
	CFG_REFEXTRACT_SUBFIELD_EXTRACTION_TIME, \
	CFG_REFEXTRACT_SUBFIELD_EXTRACTION_VERSION, \
	CFG_REFEXTRACT_VERSION, \
	CFG_REFEXTRACT_XML_RECORD_CLOSE, \
	CFG_REFEXTRACT_SUBFIELD_URL_DESCR, \
	CFG_REFEXTRACT_SUBFIELD_URL, \
	CFG_REFEXTRACT_SUBFIELD_DOI, \
	CGF_REFEXTRACT_ADJACENT_AUTH_MISC_SEPARATION, \
	CFG_REFEXTRACT_SUBFIELD_QUOTED, \
	CFG_REFEXTRACT_SUBFIELD_ISBN, \
	CFG_REFEXTRACT_SUBFIELD_PUBLISHER, \
	CFG_REFEXTRACT_SUBFIELD_YEAR, \
	CFG_REFEXTRACT_SUBFIELD_BOOK

	from invenio import config
	# Default for build_xml_citation()'s inspire_format argument; falls back to
	# False when the installed config module does not define CFG_INSPIRE_SITE.
	CFG_INSPIRE_SITE = getattr(config, 'CFG_INSPIRE_SITE', False)


	def format_marker(line_marker):
	    """Reduce a reference line marker to the part matched by re_num.

	    @param line_marker: (string) the raw marker of a reference line
	        (e.g. [19]); may be empty or None.
	    @return: the text of the first re_num match inside the marker, or the
	        marker itself, unchanged, when it is empty or nothing matches.
	    """
	    if not line_marker:
	        return line_marker

	    found = re_num.search(line_marker)
	    if not found:
	        return line_marker
	    return found.group(0)


	def create_xml_record(counts, recid, xml_lines, status_code=0):
	    """Given a series of MARC XML-ized reference lines and a record-id, build
	    and return a MARC XML record, including a datafield holding some stats
	    for the extraction job.
	    The MARC XML record essentially takes the following structure (the
	    actual tags, indicators and subfield codes come from the
	    CFG_REFEXTRACT_* settings):
	    <record>
	    <controlfield tag="001">1</controlfield>
	    <datafield tag="999" ind1="C" ind2="5">
	    [...]
	    </datafield>
	    [...]
	    <datafield tag="999" ind1="C" ind2="6">
	    <subfield code="...">status-reportnum-title-author-url-doi-misc</subfield>
	    <subfield code="...">timestamp</subfield>
	    <subfield code="...">version</subfield>
	    </datafield>
	    </record>

	    @param counts: (dictionary) the recognition statistics; the keys
	    'reportnum', 'title', 'auth_group', 'url', 'doi' and 'misc' are read.
	    @param recid: the record-id of the given document (put into the record-id
	    controlfield). It is formatted with %d, so it must be a number.
	    @param xml_lines: (list) of strings. Each string in the list contains a
	    group of MARC XML reference datafields, making up a single reference line.
	    These reference lines will make up the document body.
	    @param status_code: (integer) the status of reference-extraction for the
	    given record: was there an error or not? 0 = no error; 1 = error.
	    @return: (string) The entire MARC XML textual output, plus recognition
	    statistics, after junk filtering and subfield compression.
	    """
	    # Collect whole chunks; they are joined once at the end.
	    out = []

	    ## Start with the opening record tag:
	    out.append(u"%(record-open)s\n"
	               % {'record-open': CFG_REFEXTRACT_XML_RECORD_OPEN, })

	    ## Display the record-id controlfield:
	    out.append(
	        u""" <controlfield tag="%(cf-tag-recid)s">%(recid)d</controlfield>\n"""
	        % {'cf-tag-recid': CFG_REFEXTRACT_CTRL_FIELD_RECID,
	           'recid': recid,
	          })

	    ## Add all the reference lines to the output:
	    out.extend(xml_lines)

	    ## add the extraction-statistics datafield:
	    out.append(u""" <datafield tag="%(df-tag-ref-stats)s" ind1="%(df-ind1-ref-stats)s" ind2="%(df-ind2-ref-stats)s">
	<subfield code="%(sf-code-ref-stats)s">%(status)s-%(reportnum)s-%(title)s-%(author)s-%(url)s-%(doi)s-%(misc)s</subfield>
	<subfield code="%(sf-code-ref-time)s">%(timestamp)s</subfield>
	<subfield code="%(sf-code-ref-version)s">%(version)s</subfield>
	</datafield>\n"""
	               % {'df-tag-ref-stats': CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS,
	                  'df-ind1-ref-stats': CFG_REFEXTRACT_IND1_EXTRACTION_STATS,
	                  'df-ind2-ref-stats': CFG_REFEXTRACT_IND2_EXTRACTION_STATS,
	                  'sf-code-ref-stats': CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS,
	                  'sf-code-ref-time': CFG_REFEXTRACT_SUBFIELD_EXTRACTION_TIME,
	                  'sf-code-ref-version': CFG_REFEXTRACT_SUBFIELD_EXTRACTION_VERSION,
	                  'version': CFG_REFEXTRACT_VERSION,
	                  'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	                  'status': status_code,
	                  'reportnum': counts['reportnum'],
	                  'title': counts['title'],
	                  'author': counts['auth_group'],
	                  'url': counts['url'],
	                  'doi': counts['doi'],
	                  'misc': counts['misc'],
	                 })

	    ## Now add the closing tag to the record:
	    out.append(u"%(record-close)s\n"
	               % {'record-close': CFG_REFEXTRACT_XML_RECORD_CLOSE, })

	    ## Be sure to call this BEFORE compress_subfields
	    out = filter_processed_references(''.join(out))
	    ## Compress multiple misc subfields in a datafield
	    out = compress_subfields(out, CFG_REFEXTRACT_SUBFIELD_MISC)
	    ## Compress multiple author subfields in a datafield
	    out = compress_subfields(out, CFG_REFEXTRACT_SUBFIELD_AUTH)
	    return out


	def build_xml_citations(splitted_citations, line_marker):
	    """Build the MARC XML string of every citation split out of one line.

	    @param splitted_citations: (iterable) one list of citation elements per
	        citation, each passed on to build_xml_citation.
	    @param line_marker: (string) the line marker shared by all of them.
	    @return: (list) of the strings returned by build_xml_citation, in order.
	    """
	    xml_citations = []
	    for citation_elements in splitted_citations:
	        xml_citations.append(
	            build_xml_citation(citation_elements, line_marker))
	    return xml_citations


	def build_xml_citation(citation_elements, line_marker, inspire_format=None):
	""" Create the MARC-XML string of the found reference information which was taken
	from a tagged reference line.
	@param citation_elements: (list) an ordered list of dictionary elements,
	with each element corresponding to a found piece of information from a reference line.
	Every element is read for its 'misc_txt' and 'type' keys; the other keys
	that are read depend on the type.
	@param line_marker: (string) The line marker for this single reference line (e.g. [19])
	@param inspire_format: when true, journal subfields are written as
	"title,volume,page"; otherwise as "title volume (year) page". None (the
	default) means: use CFG_INSPIRE_SITE.
	@return xml_line: (string) The MARC-XML representation of the list of reference elements
	"""
	# NOTE(review): the block nesting of this function is not visible in this
	# copy of the file (every line sits at the same indent). The code below is
	# left exactly as found; in particular the nesting of the two
	# line_elements.append(element) statements that end the JOURNAL and AUTH
	# branches must be confirmed against the upstream source before
	# re-indenting.
	if inspire_format is None:
	inspire_format = CFG_INSPIRE_SITE

	## Begin the datafield element
	xml_line = start_datafield_element(line_marker)

	## This will hold the ordering of tags which have been appended to the xml line
	## This list will be used to control the decisions involving the creation of new citation lines
	## (in the event of a new set of authors being recognised, or strange title ordering...)
	line_elements = []

	## This is a list which will hold the current 'over-view' of a single reference line,
	## as a list of lists, where each list corresponds to the contents of a datafield element
	## in the xml mark-up. It is only handed to append_datafield_element; nothing
	## in this function reads it back.
	citation_structure = []
	## The author element to repeat for ibids; updated by append_datafield_element
	auth_for_ibid = None

	for element in citation_elements:
	## Before going onto checking 'what' the next element is, handle misc text and semi-colons
	## Multiple misc text subfields will be compressed later
	## This will also be the only part of the code that deals with MISC tag_typed elements
	## (misc text made only of punctuation, spaces and square brackets is dropped)
	if element['misc_txt'].strip(".,:;- []"):
	xml_line = append_subfield_element(xml_line,
	CFG_REFEXTRACT_SUBFIELD_MISC,
	element['misc_txt'].strip(".,:;- []"))

	# Now handle the type dependent actions
	# JOURNAL TITLE
	if element['type'] == "JOURNAL":

	# Select the journal title output format
	if inspire_format:
	# ADD to current datafield, as "title,volume,page"
	xml_line += """
	<subfield code="%(sf-code-ref-title)s">%(title)s,%(volume)s,%(page)s</subfield>""" \
	% {'sf-code-ref-title': CFG_REFEXTRACT_SUBFIELD_TITLE,
	'title' : encode_for_xml(element['title']),
	'volume' : encode_for_xml(element['volume']),
	'page' : encode_for_xml(element['page']),
	}
	else:
	# ADD to current datafield, as "title volume (year) page"
	xml_line += """
	<subfield code="%(sf-code-ref-title)s">%(title)s %(volume)s (%(year)s) %(page)s</subfield>""" \
	% {'sf-code-ref-title': CFG_REFEXTRACT_SUBFIELD_TITLE,
	'title' : encode_for_xml(element['title']),
	'volume' : encode_for_xml(element['volume']),
	'year' : encode_for_xml(element['year']),
	'page' : encode_for_xml(element['page']),
	}

	# Now, if there are any extra (numeration based) IBID's after this title
	if len(element['extra_ibids']) > 0:
	# At least one IBID is present, these are to be outputted each into their own datafield
	for ibid in element['extra_ibids']:
	# %%%%% Set as NEW citation line %%%%%
	(xml_line, auth_for_ibid) = append_datafield_element(line_marker,
	citation_structure,
	line_elements,
	auth_for_ibid,
	xml_line)
	# Same two output formats as for the head title, fed from the ibid
	if inspire_format:
	xml_line += """
	<subfield code="%(sf-code-ref-title)s">%(title)s,%(volume)s,%(page)s</subfield>""" \
	% {'sf-code-ref-title': CFG_REFEXTRACT_SUBFIELD_TITLE,
	'title' : encode_for_xml(ibid['title']),
	'volume' : encode_for_xml(ibid['volume']),
	'page' : encode_for_xml(ibid['page']),
	}
	else:
	xml_line += """
	<subfield code="%(sf-code-ref-title)s">%(title)s %(volume)s (%(year)s) %(page)s</subfield>""" \
	% {'sf-code-ref-title': CFG_REFEXTRACT_SUBFIELD_TITLE,
	'title' : encode_for_xml(ibid['title']),
	'volume' : encode_for_xml(ibid['volume']),
	'year' : encode_for_xml(ibid['year']),
	'page' : encode_for_xml(ibid['page']),
	}
	# Add a Title element to the past elements list, since we last found an IBID
	# NOTE(review): nesting of this statement is ambiguous here -- confirm
	line_elements.append(element)

	# REPORT NUMBER
	elif element['type'] == "REPORTNUMBER":
	# ADD to current datafield
	xml_line = append_subfield_element(xml_line,
	CFG_REFEXTRACT_SUBFIELD_REPORT_NUM,
	element['report_num'])
	line_elements.append(element)

	# URL
	elif element['type'] == "URL":
	# When the URL and its description are identical, output the URL only
	if element['url_string'] == element['url_desc']:
	# Build the datafield for the URL segment of the reference line:
	xml_line = append_subfield_element(xml_line,
	CFG_REFEXTRACT_SUBFIELD_URL,
	element['url_string'])
	# Else, in the case that the url string and the description differ in some way, include them both
	else:
	# Build the datafield for the URL segment of the reference line:
	xml_line += """
	<subfield code="%(sf-code-ref-url)s">%(url)s</subfield>
	<subfield code="%(sf-code-ref-url-desc)s">%(url-desc)s</subfield>""" \
	% {'sf-code-ref-url' : CFG_REFEXTRACT_SUBFIELD_URL,
	'sf-code-ref-url-desc': CFG_REFEXTRACT_SUBFIELD_URL_DESCR,
	'url' : encode_for_xml(element['url_string']),
	'url-desc' : encode_for_xml(element['url_desc'])
	}
	line_elements.append(element)

	# DOI
	elif element['type'] == "DOI":
	## Split on hitting another DOI in the same line
	if is_in_line_elements("DOI", line_elements):
	## %%%%% Set as NEW citation line %%%%%
	xml_line, auth_for_ibid = append_datafield_element(line_marker,
	citation_structure,
	line_elements,
	auth_for_ibid,
	xml_line)
	xml_line = append_subfield_element(xml_line,
	CFG_REFEXTRACT_SUBFIELD_DOI,
	element['doi_string'])
	line_elements.append(element)

	# AUTHOR
	elif element['type'] == "AUTH":
	value = element['auth_txt']
	# Author groups of type 'incl' are output wrapped in parentheses
	if element['auth_type'] == 'incl':
	value = "(%s)" % value

	# An author group found after an earlier one, but not directly following
	# an author element, is written as misc text instead of as an author
	if is_in_line_elements("AUTH", line_elements) and line_elements[-1]['type'] != "AUTH":
	xml_line = append_subfield_element(xml_line,
	CFG_REFEXTRACT_SUBFIELD_MISC,
	value)
	else:
	xml_line = append_subfield_element(xml_line,
	CFG_REFEXTRACT_SUBFIELD_AUTH,
	value)
	# NOTE(review): nesting of this statement is ambiguous here -- confirm
	line_elements.append(element)

	# QUOTED TEXT
	elif element['type'] == "QUOTED":
	xml_line = append_subfield_element(xml_line,
	CFG_REFEXTRACT_SUBFIELD_QUOTED,
	element['title'])
	line_elements.append(element)

	# ISBN
	elif element['type'] == "ISBN":
	xml_line = append_subfield_element(xml_line,
	CFG_REFEXTRACT_SUBFIELD_ISBN,
	element['ISBN'])
	line_elements.append(element)

	# BOOK: the title goes into the quoted subfield, followed by an empty
	# book subfield
	elif element['type'] == "BOOK":
	xml_line = append_subfield_element(xml_line,
	CFG_REFEXTRACT_SUBFIELD_QUOTED,
	element['title'])
	xml_line += '\n <subfield code="%s" />' % \
	CFG_REFEXTRACT_SUBFIELD_BOOK
	line_elements.append(element)

	# PUBLISHER
	elif element['type'] == "PUBLISHER":
	xml_line = append_subfield_element(xml_line,
	CFG_REFEXTRACT_SUBFIELD_PUBLISHER,
	element['publisher'])
	line_elements.append(element)

	# YEAR
	elif element['type'] == "YEAR":
	xml_line = append_subfield_element(xml_line,
	CFG_REFEXTRACT_SUBFIELD_YEAR,
	element['year'])
	line_elements.append(element)

	# Append the author, if needed for an ibid, for the last element
	# in the entire line. Don't bother setting the author to be used
	# for ibids, since the line is finished
	xml_line += check_author_for_ibid(line_elements, auth_for_ibid)[0]

	# Close the ending datafield element
	xml_line += "\n </datafield>\n"

	return xml_line


	def append_subfield_element(xml_line, subfield_code, value):
	    """Return xml_line with one more subfield element appended to it.

	    @param xml_line: (string) the XML built so far.
	    @param subfield_code: the code attribute of the new subfield.
	    @param value: (string) the subfield content; it is XML-escaped here.
	    @return: (string) xml_line followed by the new subfield element.
	    """
	    template = '\n <subfield code="%(sf-code-ref-auth)s">%(value)s</subfield>'
	    fields = {
	        'sf-code-ref-auth': subfield_code,
	        'value': encode_for_xml(value),
	    }
	    return xml_line + template % fields


	def start_datafield_element(line_marker):
	    """ Open a new reference datafield whose first subfield is the marker.
	    @param line_marker: (string) The line marker of the reference line. It is
	    passed through format_marker and XML-escaped, and becomes the sole
	    content of the marker subfield.
	    @return: (string) The opening datafield tag followed by the marker
	    subfield. The datafield is left open for further subfields.
	    """
	    marker_values = {
	        'sf-code-ref-marker': CFG_REFEXTRACT_SUBFIELD_MARKER,
	        'marker-val': encode_for_xml(format_marker(line_marker)),
	    }
	    marker_subfield = """
	<subfield code="%(sf-code-ref-marker)s">%(marker-val)s</subfield>""" % marker_values

	    datafield_values = {
	        'df-tag-ref': CFG_REFEXTRACT_TAG_ID_REFERENCE,
	        'df-ind1-ref': CFG_REFEXTRACT_IND1_REFERENCE,
	        'df-ind2-ref': CFG_REFEXTRACT_IND2_REFERENCE,
	        'marker-subfield': marker_subfield,
	    }
	    return """ <datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s">%(marker-subfield)s""" % datafield_values


	def dump_or_split_author(misc_txt, line_elements):
	    """
	    Decide what to do with a newly found author group, given the elements
	    already found in the current line and the misc text preceding the group.

	    Possible answers:
	    'dump'  - an author group already exists in the line, the last element is
	              an author group and the misc text is shorter than
	              CGF_REFEXTRACT_ADJACENT_AUTH_MISC_SEPARATION: the new group is
	              considered bad and should go into misc text.
	    'split' - start a fresh datafield and place the author there. This is
	              the answer when an author group already exists in the line
	              (and 'dump' does not apply), or when the line holds exactly
	              one element, a journal title, and there is no misc text.
	    ''      - add the author to the current datafield as normal.

	    The heuristic assumes that the first author group found in a citation is
	    the most reliable one.
	    @param misc_txt: (string) The misc text for this reference line
	    @param line_elements: (list) The list of elements found for this current line
	    @return: (string) The action to take to deal with this author.
	    """
	    if not is_in_line_elements("AUTH", line_elements):
	        ## No author so far: only split when the line consists of a single
	        ## title (ibid or normal) and no misc text at all
	        lone_title = is_in_line_elements("JOURNAL", line_elements) \
	            and len(line_elements) == 1 \
	            and len(misc_txt) == 0
	        if lone_title:
	            return "split"
	        return ""

	    ## An author group was already found in this reference line
	    follows_author = line_elements[-1]['type'] == "AUTH"
	    if follows_author \
	            and len(misc_txt) < CGF_REFEXTRACT_ADJACENT_AUTH_MISC_SEPARATION:
	        ## Two author groups with minimal misc text between them:
	        ## the second one is very likely to be wrong
	        return "dump"
	    return "split"


	def is_in_line_elements(element_type, line_elements):
	    """ Checks the list of current elements in the line for the given element type.
	    @param element_type: (string) the value to look for under the 'type' key.
	    @param line_elements: (list) of element dictionaries, each with a 'type' key.
	    @return: the tuple (True, element) for the first element of that type, or
	    False when there is none. Callers rely on both the truthiness of the
	    result and on indexing it with [1].
	    """
	    for element in line_elements:
	        if element['type'] == element_type:
	            return (True, element)
	    return False


	def split_on_semi_colon(misc_txt, line_elements, elements_processed, total_elements):
	    """ Given some misc text, see if there are any semi-colons which may indicate that
	    a reference line is in fact two separate citations.
	    @param misc_txt: (string) The misc_txt to look for semi-colons within.
	    @param line_elements: (list) The elements of the reference line which
	    have been processed so far.
	    @param elements_processed: (integer) The number of elements which have been
	    looked at for this entire reference line, regardless of splits
	    @param total_elements: (integer) The total number of elements which
	    have been identified in the entire reference line
	    @return: (string) Depicting where the semi-colon was found in relation to the
	    rest of the misc_txt: "after" when it ends the text, "before" when it
	    starts it. The empty string if no such semi-colon was found, or if the
	    trailing semi-colon only terminates an escaped character.
	    """
	    ## Only consider a split if there has already been meaningful information
	    ## found in the reference (or the misc text is long enough) and there are
	    ## still elements to be processed beyond the one relating to this misc_txt
	    worth_splitting = is_in_line_elements("JOURNAL", line_elements) \
	        or is_in_line_elements("REPORTNUMBER", line_elements) \
	        or len(misc_txt) >= CGF_REFEXTRACT_SEMI_COLON_MISC_TEXT_SENSITIVITY
	    if not worth_splitting or elements_processed >= total_elements:
	        return ""

	    ## The semi-colon that terminates an XML entity does not indicate a new
	    ## citation. These are the entities produced by XML escaping.
	    if misc_txt.endswith(('&amp;', '&lt;', '&gt;')):
	        return ""

	    stripped = misc_txt.strip(" .,")
	    if not stripped:
	        ## Nothing but spaces and punctuation: no semi-colon to act upon
	        return ""
	    ## If a semi-colon is at the end, the preceding misc_txt belongs to
	    ## the current datafield element
	    if stripped.endswith(";"):
	        return "after"
	    ## Else, the misc_txt belongs to the newly created datafield element
	    if stripped.startswith(";"):
	        return "before"
	    return ""


	def check_author_for_ibid(line_elements, author):
	    """ Given a list of elements for an entire reference line, and the current
	    author element to be used for ibids, check to see if that author element needs
	    to be inserted into this line, depending on the presence of ibids and whether
	    or not there is already an author paired with an ibid.
	    Also, if no ibids are present in the line, see if the author element needs
	    to be updated, depending on the presence of a normal title and a corresponding
	    author group.
	    @param line_elements: List of line elements for the entire processed reference
	    line
	    @param author: The current parent author element to be used with an ibid
	    (a dictionary with an 'auth_txt' key), or None
	    @return: (tuple) - containing a possible new author subfield (the empty
	    string when nothing has to be added), and the parent author element to be
	    used for future ibids (if any)
	    """
	    journal_found = is_in_line_elements("JOURNAL", line_elements)
	    if not journal_found:
	        ## Without a title in the line there is nothing to pair or to update
	        return "", author

	    ## The first title element of this line, and its author group (if any)
	    title_element = journal_found[1]
	    auth_found = is_in_line_elements("AUTH", line_elements)

	    if title_element['is_ibid']:
	        ## The line holds an ibid: repeat the parent author, unless the line
	        ## already has an author of its own or there is no parent author.
	        ## No need to reset the author to be used for ibids in this case.
	        if author is not None and not auth_found:
	            author_subfield = """
	<subfield code="%(sf-code-ref-auth)s">%(authors)s</subfield>""" \
	                % {'authors': encode_for_xml(author['auth_txt'].strip('()')),
	                   'sf-code-ref-auth': CFG_REFEXTRACT_SUBFIELD_AUTH,
	                  }
	            return author_subfield, author
	        return "", author

	    ## A standard title: its author (if any) becomes the one to repeat for
	    ## subsequent ibids; with no author, clear the author used for ibids
	    if auth_found:
	        author = auth_found[1]
	    else:
	        author = None

	    ## No author has to be replicated, append nothing to the xml line
	    return "", author


	def append_datafield_element(line_marker,
	                             citation_structure,
	                             line_elements,
	                             author,
	                             xml_line):
	    """ Finish the current datafield element and start a new one, with a new
	    marker subfield.
	    @param line_marker: (string) The line marker which will be the sole
	    content of the newly created marker subfield. This will always be the
	    first subfield to be created for a new datafield element.
	    @param citation_structure: (list) receives a copy of the elements of the
	    datafield being closed. Modified in place.
	    @param line_elements: (list) the elements of the datafield being closed.
	    Emptied in place, ready for the new datafield.
	    @param author: the author element currently used for ibids, or None.
	    @param xml_line: (string) the XML built so far.
	    @return: (tuple) the XML with the old datafield closed and the new one
	    opened, and the author element to use for future ibids.
	    """
	    ## Add an author, if one must be added for ibid's, before splitting this line
	    ## Also, if a standard title and an author are both present, save the author for future use
	    ibid_author_subfield, author = check_author_for_ibid(line_elements, author)
	    xml_line += ibid_author_subfield

	    ## Close the current datafield and start the new one
	    xml_line += """
	</datafield>
	<datafield tag="%(df-tag-ref)s" ind1="%(df-ind1-ref)s" ind2="%(df-ind2-ref)s">
	<subfield code="%(sf-code-ref-marker)s">%(marker-val)s</subfield>""" \
	        % {'df-tag-ref': CFG_REFEXTRACT_TAG_ID_REFERENCE,
	           'df-ind1-ref': CFG_REFEXTRACT_IND1_REFERENCE,
	           'df-ind2-ref': CFG_REFEXTRACT_IND2_REFERENCE,
	           'sf-code-ref-marker': CFG_REFEXTRACT_SUBFIELD_MARKER,
	           'marker-val': encode_for_xml(format_marker(line_marker))
	          }

	    ## Record the elements of the citation just closed. A copy is stored:
	    ## the list itself is emptied right below, and storing the list object
	    ## would leave citation_structure holding nothing but that empty list.
	    citation_structure.append(list(line_elements))

	    ## Clear the caller's list in place
	    del line_elements[:]

	    return xml_line, author


	def filter_processed_references(out):
	    """ Apply filters to the reference lines found - to remove junk.

	    Datafields rejected by restrict_m_subfields are removed; when any were
	    removed, the misc counter (the last dash-separated number of the 'a'
	    subfield inside the 999C6 statistics datafield) is lowered by the number
	    of removed datafields. Blank lines and trailing whitespace are dropped.
	    @param out: (string) the MARC XML of a record, one element per line.
	    @return: (string) the filtered MARC XML.
	    """
	    reference_lines = out.split('\n')

	    # Removes too long and too short m tags
	    m_restricted, ref_lines = restrict_m_subfields(reference_lines)

	    if m_restricted:
	        a_tag = re.compile(r'<subfield code="a">(.*?)</subfield>')
	        in_stats_datafield = False
	        for i, line in enumerate(ref_lines):
	            if not in_stats_datafield:
	                # Only look for the 'a' subfield inside a datafield
	                # having the attribute ind2="6"
	                in_stats_datafield = \
	                    '<datafield tag="999" ind1="C" ind2="6">' in line
	                continue
	            if '</datafield>' in line:
	                # Left the statistics datafield without finding the subfield
	                in_stats_datafield = False
	                continue
	            found = a_tag.search(line)
	            if not found:
	                continue
	            # Remake the "a" tag for the new number of "m" tags
	            words = found.group(1).split()
	            if not words:
	                continue
	            counters = words[-1].split('-')
	            try:
	                old_m = int(counters[-1])
	            except ValueError:
	                # Not the expected statistics format: leave the line alone
	                continue
	            counters[-1] = str(old_m - m_restricted)
	            words[-1] = '-'.join(counters)
	            ref_lines[i] = ' <subfield code="a">' + ' '.join(words) + '</subfield>'
	            in_stats_datafield = False

	    new_out = '\n'.join([l for l in [rec.rstrip() for rec in ref_lines] if l])

	    # Compare like with like: lines before and after the filter
	    if len(reference_lines) != len(ref_lines):
	        write_message(" * filter results: unfilter references line length is %d and filtered length is %d" \
	                      % (len(reference_lines), len(ref_lines)), verbose=2)

	    return new_out


	def restrict_m_subfields(reference_lines):
	    """Remove complete datafields which hold ONLY a single 'm' subfield,
	    AND where the misc content is too short or too long to be of use.
	    Min and max lengths derived by inspection of actual data.

	    A datafield qualifies when its 'm' subfield line is directly preceded by
	    an 'o' subfield line and the datafield opening line, and directly
	    followed by the datafield closing line.
	    @param reference_lines: (list) of strings, one XML element per line. The
	    list is not modified.
	    @return: (tuple) the number of datafields removed, and the new list of
	    lines without them.
	    """
	    min_length = 4
	    max_length = 1024
	    m_tag = re.compile(r'<subfield code="m">(.*?)</subfield>')
	    keep = [True] * len(reference_lines)
	    m_restricted = 0
	    last_index = len(reference_lines) - 1

	    for i, line in enumerate(reference_lines):
	        m_match = m_tag.search(line)
	        # Two lines before and one line after are needed for the checks below
	        if m_match is None or i < 2 or i >= last_index:
	            continue
	        is_solitary = '</datafield>' in reference_lines[i + 1] \
	            and '<subfield code="o">' in reference_lines[i - 1] \
	            and '<datafield' in reference_lines[i - 2]
	        if not is_solitary:
	            continue
	        mlength = len(m_match.group(1))
	        if mlength < min_length or mlength > max_length:
	            # Drop the four lines making up the whole datafield
	            keep[i - 2] = keep[i - 1] = keep[i] = keep[i + 1] = False
	            m_restricted += 1

	    new_reference_lines = [kept_line for kept_line, kept
	                           in zip(reference_lines, keep) if kept]
	    return m_restricted, new_reference_lines


	def get_subfield_content(line, subfield_code):
	    """ Given a line (subfield element) and a xml code attribute for a subfield element,
	    return the contents of the subfield element.
	    @param line: (string) a line holding a subfield element.
	    @param subfield_code: (string) the code attribute of the wanted subfield.
	    @return: (string) the text between the opening tag of that subfield and
	    the next closing subfield tag; the empty string when the line does not
	    hold such a subfield.
	    """
	    pieces = line.split('<subfield code="' + subfield_code + '">')
	    if len(pieces) < 2:
	        # The subfield is not on this line: there is no content to return
	        return ""
	    return pieces[1].split('</subfield>')[0]


	def compress_subfields(out, subfield_code):
	    """
	    For each datafield, compress multiple subfields of type 'subfield_code' into a single one
	    e.g. for MISC text, change xml format from:
	    <datafield tag="999" ind1="C" ind2="5">
	    <subfield code="o">1.</subfield>
	    <subfield code="m">J. Dukelsky, S. Pittel and G. Sierra</subfield>
	    <subfield code="s">Rev. Mod. Phys. 76 (2004) 643</subfield>
	    <subfield code="m">and this is some more misc text</subfield>
	    </datafield>
	    to:
	    <datafield tag="999" ind1="C" ind2="5">
	    <subfield code="o">1.</subfield>
	    <subfield code="m">J. Dukelsky, S. Pittel and G. Sierra and this is some more misc text</subfield>
	    <subfield code="s">Rev. Mod. Phys. 76 (2004) 643</subfield>
	    </datafield>
	    Datafields holding at most one such subfield keep their content.

	    The merged subfield is placed where the first of the merged subfields
	    was. When every merged subfield is empty, no subfield is output at all.
	    Blank lines and trailing whitespace are dropped from the result.
	    @param out: (string) the MARC XML, one element per line.
	    @param subfield_code: (string) the code of the subfields to merge.
	    @return: (string) the MARC XML with the subfields merged.
	    """
	    in_lines = out.split('\n')
	    # hold the subfield compressed version of the xml, line by line
	    new_rec_lines = []
	    # Index in new_rec_lines of the placeholder for the merged subfield of
	    # the current datafield; None until the subfield is first reached
	    position = None
	    # Where the concatenated text is held before being appended at the end
	    content_text = ""
	    # Components of the subfield elements
	    subfield_start = " <subfield code=\"%s\">" % subfield_code
	    subfield_marker = subfield_start.strip()
	    subfield_end = "</subfield>"

	    for line in in_lines:
	        ## If reached the end of the datafield
	        if '</datafield>' in line:
	            if content_text:
	                # Insert the concatenated contents back where it was first
	                # encountered (dont RIGHTstrip semi-colons, as these may be
	                # needed to terminate an escaped character)
	                if subfield_code == 'm':
	                    content_text = content_text.strip(" ,.").lstrip(" ;")
	                new_rec_lines[position] += content_text + subfield_end
	            elif position is not None:
	                # Only empty subfields were met: drop the placeholder rather
	                # than leave an opening tag that is never closed
	                del new_rec_lines[position]
	            content_text = ""
	            position = None
	            new_rec_lines.append(line)
	        # Found subfield in question, concatenate subfield contents
	        # for this single datafield
	        elif subfield_marker in line:
	            if position is None:
	                ## Save the position of this found subfield
	                ## for later insertion into the same place
	                new_rec_lines.append(subfield_start)
	                position = len(new_rec_lines) - 1
	            new_text = get_subfield_content(line, subfield_code)
	            if content_text and new_text:
	                ## Append spaces between merged text, if needed
	                if " " not in (content_text[-1] + new_text[0]):
	                    new_text = " " + new_text
	            content_text += new_text
	        else:
	            new_rec_lines.append(line)

	    ## Create the readable output from the list of lines.
	    new_out = [l.rstrip() for l in new_rec_lines]
	    return '\n'.join(filter(None, new_out))

refextract_xml.pyNo OneTemporaryActions

File Metadata

refextract_xml.pyView Options

Event Timeline

refextract_xml.py
No OneTemporary
Actions

refextract_xml.py
View Options