Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F86235033
refextract_record.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Oct 5, 03:48
Size
9 KB
Mime Type
text/x-python
Expires
Mon, Oct 7, 03:48 (2 d)
Engine
blob
Format
Raw Data
Handle
21379729
Attached To
R3600 invenio-infoscience
refextract_record.py
View Options
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2013 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
from
datetime
import
datetime
from
invenio.docextract_record
import
BibRecord
,
\
BibRecordField
from
invenio.refextract_config
import
\
CFG_REFEXTRACT_FIELDS
,
\
CFG_REFEXTRACT_IND1_REFERENCE
,
\
CFG_REFEXTRACT_IND2_REFERENCE
,
\
CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS
,
\
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS
,
\
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_TIME
,
\
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_VERSION
,
\
CFG_REFEXTRACT_VERSION
from
invenio
import
config
CFG_INSPIRE_SITE
=
getattr
(
config
,
'CFG_INSPIRE_SITE'
,
False
)
def
format_marker
(
line_marker
):
return
line_marker
.
strip
(
"[](){}. "
)
def
build_record
(
counts
,
fields
,
recid
=
None
,
status_code
=
0
):
"""Given a series of MARC XML-ized reference lines and a record-id, write a
MARC XML record to the stdout stream. Include in the record some stats
for the extraction job.
The printed MARC XML record will essentially take the following
structure:
<record>
<controlfield tag="001">1</controlfield>
<datafield tag="999" ind1="C" ind2="5">
[...]
</datafield>
[...]
<datafield tag="999" ind1="C" ind2="6">
<subfield code="a">
Invenio/X.XX.X refextract/X.XX.X-timestamp-err-repnum-title-URL-misc
</subfield>
</datafield>
</record>
Timestamp, error(code), reportnum, title, URL, and misc will are of
course take the relevant values.
@param status_code: (integer)the status of reference-extraction for the
given record: was there an error or not? 0 = no error; 1 = error.
@param count_reportnum: (integer) - the number of institutional
report-number citations found in the document's reference lines.
@param count_title: (integer) - the number of journal title citations
found in the document's reference lines.
@param count_url: (integer) - the number of URL citations found in the
document's reference lines.
@param count_misc: (integer) - the number of sections of miscellaneous
text (i.e. 999C5$m) from the document's reference lines.
@param count_auth_group: (integer) - the total number of author groups
identified ($h)
@param recid: (string) - the record-id of the given document. (put into
001 field.)
@param xml_lines: (list) of strings. Each string in the list contains a
group of MARC XML 999C5 datafields, making up a single reference line.
These reference lines will make up the document body.
@return: The entire MARC XML textual output, plus recognition statistics.
"""
record
=
BibRecord
(
recid
=
recid
)
record
[
'999'
]
=
fields
field
=
record
.
add_field
(
CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS
)
stats_str
=
"
%(status)s
-
%(reportnum)s
-
%(title)s
-
%(author)s
-
%(url)s
-
%(doi)s
-
%(misc)s
"
%
{
'status'
:
status_code
,
'reportnum'
:
counts
[
'reportnum'
],
'title'
:
counts
[
'title'
],
'author'
:
counts
[
'auth_group'
],
'url'
:
counts
[
'url'
],
'doi'
:
counts
[
'doi'
],
'misc'
:
counts
[
'misc'
],
}
field
.
add_subfield
(
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS
,
stats_str
)
field
.
add_subfield
(
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_TIME
,
datetime
.
now
()
.
strftime
(
"%Y-%m-
%d
%H:%M:%S"
))
field
.
add_subfield
(
CFG_REFEXTRACT_SUBFIELD_EXTRACTION_VERSION
,
CFG_REFEXTRACT_VERSION
)
return
record
def
build_references
(
citations
):
"""Build marc xml from a references list
Transform the reference elements into marc xml
"""
# Now, run the method which will take as input:
# 1. A list of lists of dictionaries, where each dictionary is a piece
# of citation information corresponding to a tag in the citation.
# 2. The line marker for this entire citation line (mulitple citation
# 'finds' inside a single citation will use the same marker value)
# The resulting xml line will be a properly marked up form of the
# citation. It will take into account authors to try and split up
# references which should be read as two SEPARATE ones.
return
[
c
for
citation_elements
in
citations
for
elements
in
citation_elements
[
'elements'
]
for
c
in
build_reference_fields
(
elements
,
citation_elements
[
'line_marker'
])]
def
add_subfield
(
field
,
code
,
value
):
return
field
.
add_subfield
(
CFG_REFEXTRACT_FIELDS
[
code
],
value
)
def
add_journal_subfield
(
field
,
element
,
inspire_format
):
if
inspire_format
:
value
=
'
%(title)s
,
%(volume)s
,
%(page)s
'
%
element
else
:
value
=
'
%(title)s
%(volume)s
(
%(year)s
)
%(page)s
'
%
element
return
add_subfield
(
field
,
'journal'
,
value
)
def
create_reference_field
(
line_marker
):
field
=
BibRecordField
(
ind1
=
CFG_REFEXTRACT_IND1_REFERENCE
,
ind2
=
CFG_REFEXTRACT_IND2_REFERENCE
)
if
line_marker
.
strip
(
"., [](){}"
):
add_subfield
(
field
,
'linemarker'
,
format_marker
(
line_marker
))
return
field
def
build_reference_fields
(
citation_elements
,
line_marker
,
inspire_format
=
None
):
""" Create the MARC-XML string of the found reference information which
was taken from a tagged reference line.
@param citation_elements: (list) an ordered list of dictionary elements,
with each element corresponding to a found
piece of information from a reference line.
@param line_marker: (string) The line marker for this single reference
line (e.g. [19])
@return xml_line: (string) The MARC-XML representation of the list of
reference elements
"""
if
inspire_format
is
None
:
inspire_format
=
CFG_INSPIRE_SITE
## Begin the datafield element
current_field
=
create_reference_field
(
line_marker
)
reference_fields
=
[
current_field
]
for
element
in
citation_elements
:
## Before going onto checking 'what' the next element is, handle misc text and semi-colons
## Multiple misc text subfields will be compressed later
## This will also be the only part of the code that deals with MISC tag_typed elements
misc_txt
=
element
[
'misc_txt'
]
if
misc_txt
.
strip
(
"., [](){}"
):
misc_txt
=
misc_txt
.
lstrip
(
'])} ,.'
)
.
rstrip
(
'[({ ,.'
)
add_subfield
(
current_field
,
'misc'
,
misc_txt
)
# Now handle the type dependent actions
# JOURNAL
if
element
[
'type'
]
==
"JOURNAL"
:
add_journal_subfield
(
current_field
,
element
,
inspire_format
)
# REPORT NUMBER
elif
element
[
'type'
]
==
"REPORTNUMBER"
:
add_subfield
(
current_field
,
'reportnumber'
,
element
[
'report_num'
])
# URL
elif
element
[
'type'
]
==
"URL"
:
if
element
[
'url_string'
]
==
element
[
'url_desc'
]:
# Build the datafield for the URL segment of the reference line:
add_subfield
(
current_field
,
'url'
,
element
[
'url_string'
])
# Else, in the case that the url string and the description differ in some way, include them both
else
:
add_subfield
(
current_field
,
'url'
,
element
[
'url_string'
])
add_subfield
(
current_field
,
'urldesc'
,
element
[
'url_desc'
])
# DOI
elif
element
[
'type'
]
==
"DOI"
:
add_subfield
(
current_field
,
'doi'
,
element
[
'doi_string'
])
# AUTHOR
elif
element
[
'type'
]
==
"AUTH"
:
value
=
element
[
'auth_txt'
]
if
element
[
'auth_type'
]
==
'incl'
:
value
=
"(
%s
)"
%
value
add_subfield
(
current_field
,
'author'
,
value
)
elif
element
[
'type'
]
==
"QUOTED"
:
add_subfield
(
current_field
,
'title'
,
element
[
'title'
])
elif
element
[
'type'
]
==
"ISBN"
:
add_subfield
(
current_field
,
'isbn'
,
element
[
'ISBN'
])
elif
element
[
'type'
]
==
"BOOK"
:
add_subfield
(
current_field
,
'title'
,
element
[
'title'
])
elif
element
[
'type'
]
==
"PUBLISHER"
:
add_subfield
(
current_field
,
'publisher'
,
element
[
'publisher'
])
elif
element
[
'type'
]
==
"YEAR"
:
add_subfield
(
current_field
,
'year'
,
element
[
'year'
])
elif
element
[
'type'
]
==
"COLLABORATION"
:
add_subfield
(
current_field
,
'collaboration'
,
element
[
'collaboration'
])
elif
element
[
'type'
]
==
"RECID"
:
add_subfield
(
current_field
,
'recid'
,
str
(
element
[
'recid'
]))
for
field
in
reference_fields
:
merge_misc
(
field
)
return
reference_fields
def
merge_misc
(
field
):
current_misc
=
None
for
subfield
in
field
.
subfields
[:]:
if
subfield
.
code
==
'm'
:
if
current_misc
is
None
:
current_misc
=
subfield
else
:
current_misc
.
value
+=
" "
+
subfield
.
value
field
.
subfields
.
remove
(
subfield
)
Event Timeline
Log In to Comment