Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F90741603
refextract_engine.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Nov 4, 08:32
Size
43 KB
Mime Type
text/x-python
Expires
Wed, Nov 6, 08:32 (2 d)
Engine
blob
Format
Raw Data
Handle
22127501
Attached To
R3600 invenio-infoscience
refextract_engine.py
View Options
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""This is the main body of refextract. It is used to extract references from
fulltext PDF documents.
"""
__revision__
=
"$Id$"
import
re
import
os
import
subprocess
from
itertools
import
chain
from
invenio.refextract_config
import
\
CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM
,
\
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL
,
\
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND
,
\
CFG_REFEXTRACT_MARKER_CLOSING_VOLUME
,
\
CFG_REFEXTRACT_MARKER_CLOSING_YEAR
,
\
CFG_REFEXTRACT_MARKER_CLOSING_PAGE
,
\
CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID
,
\
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL
,
\
CFG_REFEXTRACT_MARKER_CLOSING_TITLE
,
\
CFG_REFEXTRACT_MARKER_CLOSING_SERIES
# make refextract runnable without requiring the full Invenio installation:
from
invenio.config
import
CFG_PATH_GFILE
from
invenio.refextract_tag
import
tag_reference_line
,
\
sum_2_dictionaries
,
identify_and_tag_DOI
,
identify_and_tag_URLs
from
invenio.refextract_xml
import
create_xml_record
,
\
build_xml_citations
from
invenio.docextract_pdf
import
convert_PDF_to_plaintext
from
invenio.docextract_utils
import
write_message
from
invenio.refextract_kbs
import
get_kbs
from
invenio.refextract_linker
import
find_referenced_recid
from
invenio.refextract_re
import
get_reference_line_numeration_marker_patterns
,
\
regex_match_list
,
\
re_tagged_citation
,
\
re_numeration_no_ibid_txt
,
\
re_roman_numbers
,
\
re_recognised_numeration_for_title_plus_series
# Human-readable summary of what refextract does.
# NOTE(review): presumably shown as command-line help text - no use of this
# name is visible in this part of the file; confirm against the caller.
description = """
Refextract tries to extract the reference section from a full-text document.
Extracted reference lines are processed and any recognised citations are
marked up using MARC XML. Recognises author names, URL's, DOI's, and also
journal titles and report numbers as per the relevant knowledge bases. Results
are output to the standard output stream as default, or instead to an xml file.
"""
# General initiation tasks:
# components relating to the standardisation and
# recognition of citations in reference lines:
def remove_reference_line_marker(line):
    """Split a reference line into its leading marker and the remaining text.

    @param line: (string) - the reference line.
    @return: (tuple) of two strings:
        + the marker, i.e. the 'mark' group of the match returned by
          regex_match_list() for the numeration-marker patterns, or a
          single space character when nothing matched;
        + the reference line with leading whitespace, the marker and any
          whitespace following the marker removed.
    """
    text = line.lstrip()
    found = regex_match_list(text,
                             get_reference_line_numeration_marker_patterns())
    if found is None:
        # no marker: report a blank marker and keep the left-stripped line
        return (u" ", text)
    # cut everything up to the end of the marker match off the line
    return (found.group(u'mark'), text[found.end():].lstrip())
def roman2arabic(num):
    """Convert a roman numeral to its integer value.

    @param num: (string) an upper-case roman numeral such as XXII.
    @return: (integer) the value of the numeral; 0 for an empty string.
    """
    def symbol_value(symbol):
        # Arithmetic lookup table: for the upper-case symbols
        # I, V, X, L, C, D, M this evaluates to
        # 1, 5, 10, 50, 100, 500, 1000 respectively.
        return 10 ** (205558 % ord(symbol) % 7) % 9995

    total = 0
    previous = 0
    for symbol in num:
        current = symbol_value(symbol)
        # A smaller symbol placed before a larger one (the I of IV) has
        # already been added once, so it is taken off twice here. For
        # well-formed numerals the modulo makes this correction zero
        # whenever the previous symbol is equal to or larger than the
        # current one.
        total += current - (2 * previous) % current
        previous = current
    return total
## Transformations
def format_volume(citation_elements):
    """Rewrite roman-numeral journal volumes as arabic numbers.

    A JOURNAL element whose 'volume' matches the roman-number pattern
    (anchored at the end of the string) gets that volume replaced by the
    decimal string of its value, e.g. CXXII becomes 122.

    @param citation_elements: (list) of element dictionaries; JOURNAL
        elements are updated in place.
    @return: (list) the same citation_elements list.
    """
    roman_only = re.compile(re_roman_numbers + u'$', re.UNICODE)
    journals = (el for el in citation_elements if el['type'] == 'JOURNAL')
    for journal in journals:
        volume = journal['volume']
        if roman_only.match(volume):
            # roman2arabic expects upper-case symbols
            journal['volume'] = str(roman2arabic(volume.upper()))
    return citation_elements
def handle_special_journals(citation_elements, kbs):
    """Prefix the volume of special journals (like JHEP) with the year.

    For a JOURNAL element whose title is listed in kbs['special_journals']
    and whose volume is a one or two digit number, the volume becomes the
    last two digits of the year followed by the zero-padded volume,
    e.g. JHEP 0301 instead of JHEP 01.

    @param citation_elements: (list) of element dictionaries, updated in
        place.
    @param kbs: (dictionary) knowledge bases; only 'special_journals' is
        read here.
    @return: (list) the same citation_elements list.
    """
    for el in citation_elements:
        if el['type'] != 'JOURNAL':
            continue
        if el['title'] not in kbs['special_journals']:
            continue
        if not re.match(r'\d{1,2}$', el['volume']):
            continue

        # Sometimes the page is omitted and the year is written in its
        # place. We can never be sure, but a "page" that looks like a year
        # between 1900 and 2099 very likely is one, so the element is
        # demoted to plain miscellaneous text.
        year_is_missing = el['year'] == ''
        if year_is_missing and re.match(r'(19|20)\d{2}$', el['page']):
            el['type'] = 'MISC'
            el['misc_txt'] = "%s,%s,%s" % (el['title'],
                                           el['volume'],
                                           el['page'])

        # The volume is rewritten in every case, including for an element
        # that has just been demoted to MISC above.
        two_digit_year = el['year'][-2:]
        el['volume'] = two_digit_year + '%02d' % int(el['volume'])

    return citation_elements
def format_report_number(citation_elements):
    """Insert the dash missing between a report number's name and digits.

    e.g. CERN-LHCC2003-01 becomes CERN-LHCC-2003-01

    Only REPORTNUMBER elements whose 'report_num' consists entirely of an
    upper-case/dash name followed by a digit/dash number are considered,
    and only when the name part does not already end with a dash.

    @param citation_elements: (list) of element dictionaries, updated in
        place.
    @return: (list) the same citation_elements list.
    """
    # A plain raw string is used instead of the ``ur''`` prefix: ``ur`` is
    # a syntax error on Python 3, while ``r''`` compiles to the same
    # pattern on both Python 2 and Python 3.
    re_report = re.compile(r'^(?P<name>[A-Z-]+)(?P<nums>[\d-]+)$',
                           re.UNICODE)
    for el in citation_elements:
        if el['type'] != 'REPORTNUMBER':
            continue
        m = re_report.match(el['report_num'])
        if m:
            name = m.group('name')
            # The greedy name group swallows a dash that is already
            # present, so a trailing dash means nothing needs fixing.
            if not name.endswith('-'):
                el['report_num'] = name + '-' + m.group('nums')
    return citation_elements
def format_hep(citation_elements):
    """Give arXiv-style report numbers a slash after the category.

    e.g. replaces hep-th-9711200 with hep-th/9711200

    Applies to REPORTNUMBER elements whose 'report_num' starts with one of
    the known category prefixes; the dash that ends the prefix is replaced
    by a slash.

    @param citation_elements: (list) of element dictionaries, updated in
        place.
    @return: (list) the same citation_elements list.
    """
    categories = ('astro-ph-',
                  'hep-th-',
                  'hep-ph-',
                  'hep-ex-',
                  'hep-lat-',
                  'math-ph-')
    for el in citation_elements:
        if el['type'] != 'REPORTNUMBER':
            continue
        for category in categories:
            number = el['report_num']
            if not number.startswith(category):
                continue
            cut = len(category)
            # keep the category without its trailing dash, then a slash,
            # then whatever followed the prefix
            el['report_num'] = number[:cut - 1] + '/' + number[cut:]
    return citation_elements
def format_author_ed(citation_elements):
    """Normalise editor markers in author text to (ed.) and (eds.)

    Removes the extra space in "(ed. )" and "(eds. )" inside the
    'auth_txt' of every AUTH element.

    @param citation_elements: (list) of element dictionaries, updated in
        place.
    @return: (list) the same citation_elements list.
    """
    replacements = (('(ed. )', '(ed.)'),
                    ('(eds. )', '(eds.)'))
    for el in citation_elements:
        if el['type'] != 'AUTH':
            continue
        text = el['auth_txt']
        for spaced, tidy in replacements:
            text = text.replace(spaced, tidy)
        el['auth_txt'] = text
    return citation_elements
def look_for_books(citation_elements, kbs):
    """Replace a quoted title by a BOOK element when the books kb knows it.

    The first AUTH element and the first QUOTED element of the citation
    are looked up. When both exist and the upper-cased quoted title is a
    key of kbs['books'], a BOOK element built from that knowledge base
    entry (authors, title, year without surrounding ';') is appended and
    the QUOTED element is removed from the list.

    @param citation_elements: (list) of element dictionaries, modified in
        place.
    @param kbs: (dictionary) knowledge bases; only 'books' is read here.
    @return: (list) the same citation_elements list.
    """
    def first_of_type(wanted):
        for candidate in citation_elements:
            if candidate['type'] == wanted:
                return candidate
        return None

    authors = first_of_type('AUTH')
    title = first_of_type('QUOTED')
    if not (authors and title):
        return citation_elements

    kb_key = title['title'].upper()
    if kb_key in kbs['books']:
        line = kbs['books'][kb_key]
        book = {'type': 'BOOK',
                'misc_txt': '',
                'authors': line[0],
                'title': line[1],
                'year': line[2].strip(';')}
        citation_elements.append(book)
        citation_elements.remove(title)

    return citation_elements
def split_volume_from_journal(citation_elements):
    """Move text that follows a ';' in a journal title into the volume.

    Sometimes part of the volume is attached to the journal title,
    separated from it by a ';'. For every JOURNAL element whose title
    contains a ';', the text after the last ';' is removed from the title
    and put in front of the volume.

    @param citation_elements: (list) of element dictionaries, updated in
        place.
    @return: (list) the same citation_elements list.
    """
    for el in citation_elements:
        if el['type'] != 'JOURNAL' or ';' not in el['title']:
            continue
        pieces = el['title'].rsplit(';', 1)
        el['title'] = pieces[0]
        el['volume'] = pieces[1] + el['volume']
    return citation_elements
def remove_b_for_nucl_phys(citation_elements):
    """Drop a leading b/B from the volume of Nucl.Phys.Proc.Suppl.

    The B is removed from the volume of that journal because in INSPIRE
    the journal is handled differently.

    @param citation_elements: (list) of element dictionaries, updated in
        place.
    @return: (list) the same citation_elements list.
    """
    for el in citation_elements:
        if el['type'] != 'JOURNAL' or el['title'] != 'Nucl.Phys.Proc.Suppl.':
            continue
        if 'volume' in el and el['volume'].startswith(('b', 'B')):
            el['volume'] = el['volume'][1:]
    return citation_elements
def mangle_volume(citation_elements):
    """Make sure the volume letter is before the volume number

    e.g. transforms 100B to B100

    Applies to JOURNAL elements whose 'volume' starts with digits that are
    immediately followed by a letter (upper or lower case). The new volume
    is that letter followed by those digits.

    @param citation_elements: (list) of element dictionaries, updated in
        place.
    @return: (list) the same citation_elements list.
    """
    # A plain raw string is used instead of the ``ur""`` prefix: ``ur`` is
    # a syntax error on Python 3, while ``r""`` compiles to the same
    # pattern on both Python 2 and Python 3.
    volume_re = re.compile(r"(\d+)([A-Z])", re.U | re.I)
    for el in citation_elements:
        if el['type'] != 'JOURNAL':
            continue
        # match() only anchors at the start; anything after the first
        # letter is not part of the match and is not kept.
        matches = volume_re.match(el['volume'])
        if matches:
            el['volume'] = matches.group(2) + matches.group(1)
    return citation_elements
def balance_authors(splitted_citations, new_elements):
    """Move a trailing author group from the previous citation to the new one.

    When the last citation in splitted_citations ends with an AUTH element
    and contains more than one AUTH element, that trailing AUTH element is
    removed from it and inserted at the front of new_elements. Both lists
    are modified in place.

    @param splitted_citations: (list) of citations (lists of element
        dictionaries) completed so far; nothing happens when it is empty
        or when its last citation is empty.
    @param new_elements: (list) elements of the citation being built.
    @return: None
    """
    if not splitted_citations:
        return

    last_citation = splitted_citations[-1]
    # An empty previous citation has no trailing element to inspect;
    # indexing it with [-1] would raise IndexError.
    if not last_citation:
        return

    current_citation = new_elements
    auth_count = sum(1 for cit in last_citation if cit['type'] == 'AUTH')
    if last_citation[-1]['type'] == 'AUTH' and auth_count > 1:
        el = last_citation.pop()
        current_citation.insert(0, el)
def split_citations(citation_elements):
    """Split a citation line in multiple citations

    We handle the case where the author has put 2 citations in the same line
    but split with ; or some other method.

    A new citation is started when an element resolves (through
    find_referenced_recid) to a different record than the one currently
    being built, or when an element's misc text contains a ';' and the
    elements collected so far already form a valid citation.

    @param citation_elements: (list) of element dictionaries for one
        reference line; each one is read through its 'type' and 'misc_txt'
        keys. The 'misc_txt' values may be modified in place.
    @return: (list) of lists of element dictionaries, one inner list per
        citation. The last inner list is always appended, even when empty.
    """
    # citations completed so far
    splitted_citations = []
    # elements of the citation currently being built
    new_elements = []
    # record id most recently resolved for an element, if any
    current_recid = None

    def check_ibid(current_elements, trigger_el):
        """Give an ibid element the closest preceding author group.

        When trigger_el is flagged 'is_ibid', the current elements and then
        the previous citation are searched backwards for an AUTH element;
        a copy of the first one found is appended to new_elements.
        """
        # Check for ibid
        if trigger_el.get('is_ibid', False):
            if splitted_citations:
                els = chain(reversed(current_elements),
                            reversed(splitted_citations[-1]))
            else:
                els = reversed(current_elements)
            for el in els:
                if el['type'] == 'AUTH':
                    new_elements.append(el.copy())
                    break

    def start_new_citation():
        """Start new citation"""
        # store a copy, then empty the working list in place so that the
        # closures above keep referring to the same list object
        splitted_citations.append(new_elements[:])
        del new_elements[:]

    # text found after a splitting ';' that still has to be attached to
    # the element that follows
    to_merge = None
    for el in citation_elements:
        if to_merge:
            el['misc_txt'] = to_merge + " " + el.get('misc_txt', '')
            to_merge = None

        try:
            el_recid = find_referenced_recid(el).pop()
        except (IndexError, KeyError):
            # NOTE(review): presumably raised when nothing is found or a
            # key is missing from the element - confirm against
            # find_referenced_recid
            el_recid = None

        if current_recid and el_recid and current_recid == el_recid:
            # Do not start a new citation
            pass
        elif current_recid and el_recid and current_recid != el_recid:
            start_new_citation()
            # Some authors may be found in the previous citation
            balance_authors(splitted_citations, new_elements)
        elif ';' in el['misc_txt'] and valid_citation(new_elements):
            # the text before the last ';' stays with this element, the
            # text after it is carried over to the next element
            el['misc_txt'], to_merge = el['misc_txt'].rsplit(';', 1)
            start_new_citation()

        if el_recid:
            current_recid = el_recid

        check_ibid(new_elements, el)
        new_elements.append(el)

    if to_merge:
        # no element followed the last ';': give the text back to the
        # final element
        new_elements[-1]['misc_txt'] += " " + to_merge
        new_elements[-1]['misc_txt'] = new_elements[-1]['misc_txt'].strip()

    splitted_citations.append(new_elements)

    return splitted_citations
def valid_citation(citation):
    """Tell whether a citation holds anything other than MISC elements.

    @param citation: (list) of element dictionaries.
    @return: (boolean) True when at least one element has a type other
        than 'MISC'; False otherwise, including for an empty citation.
    """
    return any(el['type'] != 'MISC' for el in citation)
def remove_invalid_references(splitted_citations):
    """Drop citations that hold only miscellaneous text, keeping their text.

    Empty citations are discarded. The misc text of every citation that is
    not valid (see valid_citation) is appended to the last element of the
    closest valid citation before it or, when there is none, of the first
    valid citation after it. Merging only into citations that survive the
    final filtering means the text is not thrown away together with an
    invalid neighbour.

    @param splitted_citations: (list) of citations (lists of element
        dictionaries). The elements of valid citations may have their
        'misc_txt' extended in place.
    @return: (list) of the valid citations, in their original order.
    """
    def add_misc(el, txt):
        """Append txt to the element's misc text, space-separated."""
        if not el.get('misc_txt'):
            el['misc_txt'] = txt
        else:
            el['misc_txt'] += " " + txt

    citations = [citation for citation in splitted_citations if citation]
    # Validity only depends on element types, which merging never changes,
    # so the result can be computed up front.
    valid_citations = [citation for citation in citations
                       if valid_citation(citation)]

    # Without any valid citation there is nothing to merge the text into
    if valid_citations:
        last_valid = None
        for citation in citations:
            if valid_citation(citation):
                last_valid = citation
                continue
            # Merge to previous valid one misc txt, or to the first valid
            # one when this citation comes before all of them
            if last_valid is not None:
                citation_to_merge_into = last_valid
            else:
                citation_to_merge_into = valid_citations[0]
            for el in citation:
                add_misc(citation_to_merge_into[-1], el['misc_txt'])

    return valid_citations
def add_year_elements(splitted_citations):
    """Append a YEAR element to every citation that has a usable year.

    For each citation that does not already contain a YEAR element, the
    first JOURNAL or BOOK element carrying a 'year' key is looked up; when
    its year is not empty a {'type': 'YEAR'} element with that year is
    appended to the citation.

    @param splitted_citations: (list) of citations (lists of element
        dictionaries), modified in place.
    @return: (list) the same splitted_citations list.
    """
    for citation in splitted_citations:
        # Skip citations that already carry a YEAR element so that a
        # second pass does not add a duplicate.
        if any(el['type'] == 'YEAR' for el in citation):
            continue

        year = None
        for el in citation:
            # The type test is grouped explicitly: without it, "and" binds
            # tighter than "or" and a JOURNAL without a 'year' key would
            # raise KeyError.
            if el['type'] in ('JOURNAL', 'BOOK') and 'year' in el:
                year = el['year']
                break

        if year:
            citation.append({'type': 'YEAR',
                             'year': year,
                             'misc_txt': '',
                             })

    return splitted_citations
## End of elements transformations
def print_citations(splitted_citations, line_marker):
    """Write a debug dump of the parsed citations of one reference line.

    Everything is sent through write_message() with verbose=9: a header,
    the line marker, and for every citation the type and repr() of each of
    its elements.

    @param splitted_citations: (list) of citations (lists of element
        dictionaries).
    @param line_marker: the marker of the reference line being reported.
    @return: None
    """
    write_message('* splitted_citations', verbose=9)
    write_message(' * line marker %s' % line_marker, verbose=9)
    for citation in splitted_citations:
        write_message(" * elements", verbose=9)
        for el in citation:
            write_message(' * %s %s' % (el['type'], repr(el)), verbose=9)
def parse_reference_line(ref_line, kbs, bad_titles_count=None):
    """Parse one reference line

    @param ref_line: (string) a single reference bullet.
    @param kbs: (dictionary) the knowledge bases, handed on to the tagging
        and transformation steps.
    @param bad_titles_count: (dictionary) running count of 'bad titles',
        handed to tag_reference_line(). A fresh dictionary is used when it
        is omitted, so that counts are not shared between unrelated calls.
    @return: (tuple) of four components:
        + the parsed references: a list of citations, each a list of
          element dictionaries;
        + the line marker;
        + a dictionary with the counts of what was found in the line;
        + the bad titles count as returned by tag_reference_line().
    """
    # A mutable default argument would be created once and then shared by
    # every call that relies on the default.
    if bad_titles_count is None:
        bad_titles_count = {}

    # Strip the 'marker' (e.g. [1]) from this reference line:
    (line_marker, ref_line) = remove_reference_line_marker(ref_line)
    # Find DOI sections in citation
    (ref_line, identified_dois) = identify_and_tag_DOI(ref_line)
    # Identify and replace URLs in the line:
    (ref_line, identified_urls) = identify_and_tag_URLs(ref_line)
    # Tag <cds.JOURNAL>, etc.
    tagged_line, bad_titles_count = tag_reference_line(ref_line,
                                                       kbs,
                                                       bad_titles_count)

    # Debug print tagging (authors, titles, volumes, etc.)
    write_message('* tags %r' % tagged_line, verbose=9)

    # Turn the tagged line into a list of element dictionaries.
    # At the same time, get stats of citations found in the reference line
    # (titles, urls, etc):
    citation_elements, line_marker, counts = \
        parse_tagged_reference_line(line_marker,
                                    tagged_line,
                                    identified_dois,
                                    identified_urls)

    # Transformations on elements
    citation_elements = split_volume_from_journal(citation_elements)
    citation_elements = format_volume(citation_elements)
    citation_elements = handle_special_journals(citation_elements, kbs)
    citation_elements = format_report_number(citation_elements)
    citation_elements = format_author_ed(citation_elements)
    citation_elements = look_for_books(citation_elements, kbs)
    citation_elements = format_hep(citation_elements)
    citation_elements = remove_b_for_nucl_phys(citation_elements)
    citation_elements = mangle_volume(citation_elements)

    # Split the reference in multiple ones if needed
    splitted_citations = split_citations(citation_elements)

    # Remove references with only misc text
    splitted_citations = remove_invalid_references(splitted_citations)
    # Find year
    splitted_citations = add_year_elements(splitted_citations)
    # For debugging purposes
    print_citations(splitted_citations, line_marker)

    return splitted_citations, line_marker, counts, bad_titles_count
def parse_references_elements(ref_sect, kbs):
    """Parse every line of a reference section.

    Each line is handed to parse_reference_line(); the citations it finds
    are collected together with the line marker, and the per-line counts
    are summed up.

    @param ref_sect: (list) of strings - each string in the list is a
        reference line.
    @param kbs: (dictionary) the knowledge bases, passed on unchanged to
        parse_reference_line().
    @return: (tuple) of 3 components:
        + list of dictionaries, one per reference line, with the keys
          'elements' (the citations parsed from the line) and
          'line_marker';
        + dictionary of totals for the keys 'misc', 'title', 'reportnum',
          'url', 'doi' and 'auth_group';
        + dictionary with the totals for each 'bad title' found in the
          reference section.
    """
    citations = []
    bad_titles_count = {}
    counts = dict.fromkeys(
        ('misc', 'title', 'reportnum', 'url', 'doi', 'auth_group'), 0)

    for ref_line in ref_sect:
        parsed = parse_reference_line(ref_line, kbs, bad_titles_count)
        elements, marker, line_counts, bad_titles_count = parsed
        # keep the running totals for the whole section
        counts = sum_2_dictionaries(counts, line_counts)
        citations.append({'elements': elements,
                          'line_marker': marker})

    return citations, counts, bad_titles_count
def parse_tagged_reference_line(line_marker,
                                line,
                                identified_dois,
                                identified_urls):
    """Given a single tagged reference line, convert it to a list of
    citation elements.

    Try to find all tags and extract their contents and their types into
    corresponding dictionary elements. Each dictionary is appended onto a
    list, in the order in which the tags occur in the line. Text found
    between tags is collected as miscellaneous text and stored in the
    'misc_txt' of the element that follows it.
    This method is dumb, with very few heuristics. It simply looks for tags,
    and makes dictionaries from the data it finds in a tagged reference line.

    @param line_marker: (string) The line marker for this single reference
        line (e.g. [19])
    @param line: (string) The tagged reference line.
    @param identified_dois: (list) a list of dois which were found in this
        line. The ordering of dois corresponds to the ordering of tags in the
        line, reading from left to right. Entries are removed from the front
        of this list as DOI tags are processed.
    @param identified_urls: (list) a list of urls which were found in this
        line. The ordering of urls corresponds to the ordering of tags in the
        line, reading from left to right. Each entry is read as
        (url, description); entries are removed from the front of this list
        as URL tags are processed.
    @return: (tuple) of three components:
        + (list) the citation elements (dictionaries) found in the line;
        + the line marker, passed through unchanged;
        + (dictionary) the number of pieces of info found in the line, keyed
          by 'misc', 'title', 'reportnum', 'url', 'doi' and 'auth_group'.
    """
    count_misc = count_title = count_reportnum = count_url = count_doi = count_auth_group = 0
    processed_line = line
    cur_misc_txt = u""

    tag_match = re_tagged_citation.search(processed_line)

    # contains a list of dictionary entries of previously cited items
    citation_elements = []
    # the last tag element found when working from left-to-right across the line
    identified_citation_element = None

    # NOTE(review): every branch below shortens processed_line; a tag type
    # matched by re_tagged_citation but handled by none of the branches would
    # leave it untouched - confirm the pattern cannot produce one.
    while tag_match is not None:
        # While there are tags inside this reference line...
        tag_match_start = tag_match.start()
        tag_match_end = tag_match.end()
        tag_type = tag_match.group(1)
        # everything before the tag is miscellaneous text
        cur_misc_txt += processed_line[0:tag_match_start]

        # Catches both standard titles, and ibid's
        if tag_type.find("JOURNAL") != -1:
            # This tag is an identified journal TITLE. It should be followed
            # by VOLUME, YEAR and PAGE tags.

            # See if the found title has been tagged as an ibid: <cds.JOURNALibid>
            if tag_match.group('ibid'):
                is_ibid = True
                closing_tag_length = len(CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID)
                idx_closing_tag = processed_line.find(CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID,
                                                      tag_match_end)
            else:
                is_ibid = False
                closing_tag_length = len(CFG_REFEXTRACT_MARKER_CLOSING_TITLE)
                # extract the title from the line:
                idx_closing_tag = processed_line.find(CFG_REFEXTRACT_MARKER_CLOSING_TITLE,
                                                      tag_match_end)

            if idx_closing_tag == -1:
                # no closing TITLE tag found - get rid of the solitary tag
                processed_line = processed_line[tag_match_end:]
                identified_citation_element = None
            else:
                # Closing tag was found:
                # The title text to be used in the marked-up citation:
                title_text = processed_line[tag_match_end:idx_closing_tag]

                # Now trim this matched title and its tags from the start of the line:
                processed_line = processed_line[idx_closing_tag + closing_tag_length:]

                numeration_match = re_recognised_numeration_for_title_plus_series.search(processed_line)
                if numeration_match:
                    # recognised numeration immediately after the title - extract it:
                    reference_volume = numeration_match.group('vol')
                    reference_year = numeration_match.group('yr') or ''
                    reference_page = numeration_match.group('pg')

                    # This is used on two accounts:
                    # 1. To get the series char from the title, if no series was found with the numeration
                    # 2. To always remove any series character from the title match text
                    # series_from_title = re_series_from_title.search(title_text)
                    #
                    if numeration_match.group('series'):
                        reference_volume = numeration_match.group('series') + reference_volume

                    # Skip past the matched numeration in the working line:
                    processed_line = processed_line[numeration_match.end():]

                    # 'is_ibid' saves whether THIS TITLE is an ibid or not. (True or False)
                    # 'extra_ibids' are there to hold ibid's without the word 'ibid', which
                    # come directly after this title
                    # i.e., they are recognised using title numeration instead of ibid notation
                    identified_citation_element = {'type': "JOURNAL",
                                                   'misc_txt': cur_misc_txt,
                                                   'title': title_text,
                                                   'volume': reference_volume,
                                                   'year': reference_year,
                                                   'page': reference_page,
                                                   'is_ibid': is_ibid,
                                                   'extra_ibids': []
                                                   }
                    count_title += 1
                    cur_misc_txt = u""

                    # Try to find IBID's after this title, on top of previously found titles that were
                    # denoted with the word 'IBID'. (i.e. look for IBID's without the word 'IBID' by
                    # looking at extra numeration after this title)
                    numeration_match = re_numeration_no_ibid_txt.match(processed_line)
                    while numeration_match is not None:
                        reference_volume = numeration_match.group('vol')
                        reference_year = numeration_match.group('yr')
                        reference_page = numeration_match.group('pg')

                        if numeration_match.group('series'):
                            reference_volume = numeration_match.group('series') + reference_volume

                        # Skip past the matched numeration in the working line:
                        processed_line = processed_line[numeration_match.end():]

                        # Takes the just found title text
                        identified_citation_element['extra_ibids'].append(
                            {'type': "JOURNAL",
                             'misc_txt': "",
                             'title': title_text,
                             'volume': reference_volume,
                             'year': reference_year,
                             'page': reference_page,
                             })
                        # Increment the stats counters:
                        count_title += 1

                        # NOTE(review): title_text is cleared here, so a
                        # second extra ibid found by this loop is stored with
                        # an empty title - confirm this is intended.
                        title_text = ""
                        reference_volume = ""
                        reference_year = ""
                        reference_page = ""
                        numeration_match = re_numeration_no_ibid_txt.match(processed_line)
                else:
                    # No numeration was recognised after the title. Add the title into a MISC item instead:
                    cur_misc_txt += "%s" % title_text
                    identified_citation_element = None

        elif tag_type == "REPORTNUMBER":
            # This tag is an identified institutional report number:

            # extract the institutional report-number from the line:
            idx_closing_tag = processed_line.find(CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM,
                                                  tag_match_end)
            # Sanity check - did we find a closing report-number tag?
            if idx_closing_tag == -1:
                # no closing </cds.REPORTNUMBER> tag found - strip the opening tag and move past this
                # recognised reportnumber as it is unreliable:
                processed_line = processed_line[tag_match_end:]
                identified_citation_element = None
            else:
                # closing tag was found
                report_num = processed_line[tag_match_end:idx_closing_tag]
                # now trim this matched institutional report-number
                # and its tags from the start of the line:
                ending_tag_pos = idx_closing_tag \
                    + len(CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM)
                processed_line = processed_line[ending_tag_pos:]

                identified_citation_element = {'type': "REPORTNUMBER",
                                               'misc_txt': cur_misc_txt,
                                               'report_num': report_num}
                count_reportnum += 1
                cur_misc_txt = u""

        elif tag_type == "URL":
            # This tag is an identified URL:

            # From the "identified_urls" list, get this URL and its
            # description string:
            # NOTE(review): assumes identified_urls still holds one entry per
            # remaining URL tag; an empty list would raise IndexError here.
            url_string = identified_urls[0][0]
            url_desc = identified_urls[0][1]

            # Now move past this "<cds.URL />"tag in the line:
            processed_line = processed_line[tag_match_end:]

            # Delete the information for this URL from the start of the list
            # of identified URLs:
            identified_urls[0:1] = []

            # Save the current misc text
            identified_citation_element = {
                'type': "URL",
                'misc_txt': "%s" % cur_misc_txt,
                'url_string': "%s" % url_string,
                'url_desc': "%s" % url_desc
            }

            count_url += 1
            cur_misc_txt = u""

        elif tag_type == "DOI":
            # This tag is an identified DOI:

            # From the "identified_dois" list, get this DOI:
            # NOTE(review): assumes identified_dois still holds one entry per
            # remaining DOI tag; an empty list would raise IndexError here.
            doi_string = identified_dois[0]

            # Now move past this DOI tag in the line:
            processed_line = processed_line[tag_match_end:]

            # Remove DOI from the list of DOI strings
            identified_dois[0:1] = []

            # SAVE the current misc text
            identified_citation_element = {
                'type': "DOI",
                'misc_txt': "%s" % cur_misc_txt,
                'doi_string': "%s" % doi_string
            }

            # Increment the stats counters:
            count_doi += 1
            cur_misc_txt = u""

        elif tag_type.find("AUTH") != -1:
            # This tag is an identified Author:

            auth_type = ""
            # find the closing tag that belongs to this kind of author tag:
            # NOTE(review): idx_closing_tag_nearest is only assigned in the
            # three branches below; an AUTH tag type containing none of
            # "stnd", "etal" or "incl" would reach the check that follows
            # with the name unset (or stale from an earlier tag) - confirm
            # re_tagged_citation cannot produce one.
            if tag_type.find("stnd") != -1:
                auth_type = "stnd"
                idx_closing_tag_nearest = processed_line.find(
                    CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, tag_match_end)
            elif tag_type.find("etal") != -1:
                auth_type = "etal"
                idx_closing_tag_nearest = processed_line.find(
                    CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL, tag_match_end)
            elif tag_type.find("incl") != -1:
                auth_type = "incl"
                idx_closing_tag_nearest = processed_line.find(
                    CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL, tag_match_end)

            if idx_closing_tag_nearest == -1:
                # no closing </cds.AUTH****> tag found - strip the opening tag
                # and move past it
                processed_line = processed_line[tag_match_end:]
                identified_citation_element = None
            else:
                auth_txt = processed_line[tag_match_end:idx_closing_tag_nearest]
                # Now move past the ending tag in the line:
                # NOTE(review): the closing tag length is taken from a
                # placeholder string rather than from the configured marker;
                # presumably all three closing author markers have this
                # length - confirm against refextract_config.
                processed_line = processed_line[idx_closing_tag_nearest + len("</cds.AUTHxxxx>"):]
                # SAVE the current misc text
                identified_citation_element = {
                    'type': "AUTH",
                    'misc_txt': "%s" % cur_misc_txt,
                    'auth_txt': "%s" % auth_txt,
                    'auth_type': "%s" % auth_type
                }

                # Increment the stats counters:
                count_auth_group += 1
                cur_misc_txt = u""

        # These following tags may be found separately;
        # They are usually found when a "JOURNAL" tag is hit
        # (ONLY immediately afterwards, however)
        # Sitting by themselves means they do not have
        # an associated TITLE tag, and should be MISC
        elif tag_type == "SER":
            # This tag is a SERIES tag; Since it was not preceded by a TITLE
            # tag, it is useless - strip the tag and put it into miscellaneous:
            (cur_misc_txt, processed_line) = \
                convert_unusable_tag_to_misc(processed_line, cur_misc_txt,
                                             tag_match_end,
                                             CFG_REFEXTRACT_MARKER_CLOSING_SERIES)
            identified_citation_element = None

        elif tag_type == "VOL":
            # This tag is a VOLUME tag; Since it was not preceded by a TITLE
            # tag, it is useless - strip the tag and put it into miscellaneous:
            (cur_misc_txt, processed_line) = \
                convert_unusable_tag_to_misc(processed_line, cur_misc_txt,
                                             tag_match_end,
                                             CFG_REFEXTRACT_MARKER_CLOSING_VOLUME)
            identified_citation_element = None

        elif tag_type == "YR":
            # This tag is a YEAR tag; Since it's not preceded by TITLE and
            # VOLUME tags, it is useless - strip the tag and put the contents
            # into miscellaneous:
            (cur_misc_txt, processed_line) = \
                convert_unusable_tag_to_misc(processed_line, cur_misc_txt,
                                             tag_match_end,
                                             CFG_REFEXTRACT_MARKER_CLOSING_YEAR)
            identified_citation_element = None

        elif tag_type == "PG":
            # This tag is a PAGE tag; Since it's not preceded by TITLE,
            # VOLUME and YEAR tags, it is useless - strip the tag and put the
            # contents into miscellaneous:
            (cur_misc_txt, processed_line) = \
                convert_unusable_tag_to_misc(processed_line, cur_misc_txt,
                                             tag_match_end,
                                             CFG_REFEXTRACT_MARKER_CLOSING_PAGE)
            identified_citation_element = None

        elif tag_type == "QUOTED":
            # the content of the tag is stored under the 'title' key
            identified_citation_element, processed_line, cur_misc_txt = \
                map_tag_to_subfield(tag_type,
                                    processed_line[tag_match_end:],
                                    cur_misc_txt,
                                    'title')

        elif tag_type == "ISBN":
            # the content of the tag is stored under a key named after the tag
            identified_citation_element, processed_line, cur_misc_txt = \
                map_tag_to_subfield(tag_type,
                                    processed_line[tag_match_end:],
                                    cur_misc_txt,
                                    tag_type)

        elif tag_type == "PUBLISHER":
            # the content of the tag is stored under the 'publisher' key
            identified_citation_element, processed_line, cur_misc_txt = \
                map_tag_to_subfield(tag_type,
                                    processed_line[tag_match_end:],
                                    cur_misc_txt,
                                    'publisher')

        if identified_citation_element:
            # Append the found tagged data and current misc text
            citation_elements.append(identified_citation_element)
            identified_citation_element = None

        # Look for the next tag in the processed line:
        tag_match = re_tagged_citation.search(processed_line)

    # whatever is left after the last tag is miscellaneous text too:
    cur_misc_txt += processed_line

    # This MISC element will hold the entire citation in the event
    # that no tags were found. Text made only of spaces and . ; ,
    # characters is not worth an element.
    if len(cur_misc_txt.strip(" .;,")) > 0:
        # Increment the stats counters:
        count_misc += 1
        identified_citation_element = {
            'type': "MISC",
            'misc_txt': "%s" % cur_misc_txt,
        }
        citation_elements.append(identified_citation_element)

    return (citation_elements, line_marker, {
        'misc': count_misc,
        'title': count_title,
        'reportnum': count_reportnum,
        'url': count_url,
        'doi': count_doi,
        'auth_group': count_auth_group
    })
def map_tag_to_subfield(tag_type, line, cur_misc_txt, dest):
    """Create a new reference element from a simple <cds.TAG>...</cds.TAG>.

    @param tag_type: (string) the tag name, e.g. QUOTED; it is also used as
        the 'type' of the created element.
    @param line: (string) the working line, starting with the content of the
        tag, i.e. right after the opening tag. A line that still starts with
        the opening tag is accepted as well when no closing tag is found.
    @param cur_misc_txt: (string) the miscellaneous text collected so far.
    @param dest: (string) the key under which the content of the tag is
        stored in the created element.
    @return: (tuple) of three components:
        + the created element (dictionary with 'type', 'misc_txt' and the
          dest key), or None when no closing tag was found;
        + the rest of the line after the closing tag; without a closing tag
          the line is returned with only a leading opening tag removed;
        + the miscellaneous text still to be attached to an element: an
          empty string when an element was created (it now owns the text),
          cur_misc_txt unchanged otherwise.
    """
    closing_tag = '</cds.%s>' % tag_type
    idx_closing_tag = line.find(closing_tag)

    # Sanity check - did we find a closing tag?
    if idx_closing_tag == -1:
        # No closing </cds.TAG> tag found: the tag is unreliable, so no
        # element is created. Only an opening tag that is really present is
        # stripped; slicing a fixed number of characters off a line that
        # already starts after the opening tag would throw away real text.
        opening_tag = '<cds.%s>' % tag_type
        if line.startswith(opening_tag):
            line = line[len(opening_tag):]
        return None, line, cur_misc_txt

    identified_citation_element = {'type': tag_type,
                                   'misc_txt': cur_misc_txt,
                                   dest: line[:idx_closing_tag]}
    # move past the content and its closing tag
    line = line[idx_closing_tag + len(closing_tag):]
    return identified_citation_element, line, u""
def convert_unusable_tag_to_misc(line,
                                 misc_text,
                                 tag_match_end,
                                 closing_tag):
    """Turn an unwanted tagged item of a reference line into plain text.

    The content of the tag is added to the miscellaneous text and the line
    is cut off after the closing tag. For example, given the working line:
        Example, AN. Testing software; <cds.YR>(2001)</cds.YR>, CERN, Geneva.
    the position just after <cds.YR>, the closing tag </cds.YR> and the
    miscellaneous text "Example, AN. Testing software; ", the line returned
    is:
        , CERN, Geneva.
    and the miscellaneous text returned is:
        Example, AN. Testing software; (2001)
    When the closing tag cannot be found, the line is only cut off after
    the opening tag and the miscellaneous text is returned unchanged.

    @param line: (string) - the reference line.
    @param misc_text: (string) - the variable containing the miscellaneous
        text recorded so far.
    @param tag_match_end: (integer) - the index of the end of the opening tag
        in the line.
    @param closing_tag: (string) - the closing tag to look for in the line
        (e.g. </cds.YR>).
    @return: (tuple) - containing misc_text (string) and line (string)
    """
    close_at = line.find(closing_tag, tag_match_end)
    if close_at == -1:
        # solitary opening tag: just move past it
        return (misc_text, line[tag_match_end:])

    content = line[tag_match_end:close_at]
    remainder = line[close_at + len(closing_tag):]
    return (misc_text + content, remainder)
# Tasks related to extraction of reference section from full-text:
# ----> 1. Removing page-breaks, headers and footers before
# searching for reference section:
# ----> 2. Finding reference section in full-text:
# ----> 3. Found reference section - now take out lines and rebuild them:
def remove_leading_garbage_lines_from_reference_section(ref_sectn):
    """Sometimes, the first lines of the extracted references are completely
    blank or email addresses. These must be removed as they are not
    references.

    A leading line is dropped when it is empty, consists only of whitespace,
    or starts (after optional whitespace) with 'email' / 'e-mail'. Scanning
    stops at the first line that is neither. The list is modified in place
    and the very same list object is returned.

    @param ref_sectn: (list) of strings - the reference section lines
    @return: (list) of strings - the reference section without leading
     blank lines or email addresses.
    """
    # A plain raw string is used instead of a ur'' literal: the pattern is
    # pure ASCII, and ur'' is a syntax error on Python 3.
    p_email = re.compile(r'^\s*e\-?mail', re.UNICODE)

    num_garbage = 0
    for ref_line in ref_sectn:
        # ''.isspace() is False, so the empty string is tested explicitly.
        is_blank = (not ref_line) or ref_line.isspace()
        if not is_blank and p_email.match(ref_line) is None:
            break
        num_garbage += 1

    # One slice deletion instead of repeated pop(0) calls, which would shift
    # the whole list once per removed line.
    del ref_sectn[:num_garbage]
    return ref_sectn
# ----> Glue - logic for finding and extracting reference section:
# Tasks related to conversion of full-text to plain-text:
def get_plaintext_document_body(fpath, keep_layout=False):
    """Given a file-path to a full-text, return a list of unicode strings
    whereby each string is a line of the fulltext.
    In the case of a plain-text document, this simply means reading the
    contents in from the file (decoded as UTF-8). In the case of a PDF
    however, this means converting the document to plaintext via
    convert_PDF_to_plaintext.

    The document type is decided from the output of the external command
    configured as CFG_PATH_GFILE, run with the file path as its argument.

    @param fpath: (string) - the path to the fulltext file
    @param keep_layout: (boolean) - passed through unchanged to
     convert_PDF_to_plaintext.
    @return: (tuple) - (textbody, status): textbody is a (list) of strings,
     each string being a line in the document; status is 0 unless the file
     is not readable or its format is not recognised (1), or whatever
     status convert_PDF_to_plaintext reports for PDFs.
    """
    textbody = []
    status = 0
    if os.access(fpath, os.F_OK | os.R_OK):
        # filepath OK - attempt to extract references:
        # get file type:
        cmd_gfile = [CFG_PATH_GFILE, fpath]
        pipe_gfile = subprocess.Popen(cmd_gfile, stdout=subprocess.PIPE)
        # communicate() drains stdout, closes the pipe and waits for the
        # child, so neither a file descriptor nor a zombie process is left.
        res_gfile = pipe_gfile.communicate()[0].lower()

        # NOTE(review): if the tool echoes the file path in its output, a
        # file name containing "pdf" or "text" may skew this detection -
        # verify against the configured command.
        looks_like_pdf = res_gfile.find("pdf") != -1
        looks_like_text = res_gfile.find("text") != -1

        if looks_like_text and not looks_like_pdf:
            # plain-text file: don't convert - just read in:
            f = open(fpath, "r")
            try:
                textbody = [line.decode("utf-8") for line in f.readlines()]
            finally:
                f.close()
        elif looks_like_pdf:
            # convert from PDF (any output mentioning "pdfa" also contains
            # "pdf", so no separate test is needed)
            (textbody, status) = convert_PDF_to_plaintext(fpath, keep_layout)
        else:
            # invalid format
            status = 1
    else:
        # filepath not OK
        status = 1
    return (textbody, status)
def build_xml_references(citations):
    """Build the MARC XML lines for a list of parsed citations.

    Each citation is a dictionary holding, under 'elements', the pieces of
    citation information found in one reference line and, under
    'line_marker', the marker of that line (several citations found inside
    a single reference line share the same marker). Both are handed to
    build_xml_citations and the XML lines it returns for every citation are
    gathered, in order, into one flat list.

    @param citations: (list) of dictionaries with the keys 'elements' and
     'line_marker'.
    @return: (list) - the XML lines of all the citations.
    """
    return [xml_line
            for citation in citations
            for xml_line in build_xml_citations(citation['elements'],
                                                citation['line_marker'])]
def parse_references(reference_lines, recid=1, kbs_files=None):
    """Parse a list of references

    Given a list of raw reference lines (list of strings),
    output the MARC-XML content extracted version

    @param reference_lines: (list) of strings - the raw reference lines.
    @param recid: the record identifier handed to create_xml_record.
    @param kbs_files: custom knowledge base files handed to get_kbs; None
     by default.
    @return: the value returned by create_xml_record for the parsed
     references.
    """
    # Load the RefExtract knowledge bases
    knowledge_bases = get_kbs(custom_kbs_files=kbs_files)

    # Identify the elements of each reference line; the third item of the
    # result is not needed here.
    parsed = parse_references_elements(reference_lines, knowledge_bases)
    (citations, counts, dummy_bad_titles_count) = parsed

    # Turn the identified elements into MARC XML lines and wrap them into
    # the final XML record.
    xml_lines = build_xml_references(citations)
    return create_xml_record(counts, recid, xml_lines)
Event Timeline
Log In to Comment