Page MenuHomec4science

refextract_find.py
No OneTemporary

File Metadata

Created
Sun, Aug 25, 22:29

refextract_find.py

# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Finding the reference section from the fulltext"""
import re
from invenio.docextract_utils import write_message
from invenio.refextract_re import \
get_reference_section_title_patterns, \
get_reference_line_numeration_marker_patterns, \
regex_match_list, \
get_post_reference_section_title_patterns, \
get_post_reference_section_keyword_patterns, \
re_reference_line_bracket_markers, \
re_reference_line_dot_markers, \
re_reference_line_number_markers, \
re_num
def find_reference_section(docbody):
    """Locate the first line of the reference section via its title.

    The document is scanned from its last line towards its first. Every
    line matching one of the reference-section title patterns is treated
    as a candidate; the candidate line and the two lines after it are
    handed to find_numeration() to detect a numeration marker.

    @param docbody: (list) of strings - the full document body.
    @return: (dictionary) describing the best candidate found:
        { 'start_line'             : (integer) index in docbody of the
                                     line holding the section title,
          'title_string'           : (string) the matched title,
          'marker'                 : marker of the first reference line
                                     (None when no numeration was seen),
          'marker_pattern'         : regexp string that matched the marker
                                     (None when no numeration was seen),
          'title_marker_same_line' : flag telling whether title and
                                     marker share a line,
        }
        -- OR --
        (None) when no reference section title could be found.
    """
    title_patterns = get_reference_section_title_patterns()
    best_details = None
    last_index = len(docbody) - 1

    # Walk backwards: enumerate() counts the distance from the last line.
    for offset_from_end, line in enumerate(reversed(docbody)):
        title_match = regex_match_list(line, title_patterns)
        if not title_match:
            continue

        title = title_match.group('title')
        index = last_index - offset_from_end
        candidate, found_title = find_numeration(docbody[index:index + 3],
                                                 title)

        if candidate:
            if best_details:
                # Never replace a better candidate with a poorer one.
                # NOTE(review): no helper visible in this module appears to
                # set a 'title' key, so this first guard looks inert -
                # confirm before relying on it.
                if 'title' in best_details \
                        and best_details['title'] \
                        and not candidate['title']:
                    continue
                # A candidate without a marker must not displace one
                # that has a marker.
                if 'marker' in best_details \
                        and best_details['marker'] \
                        and not candidate['marker']:
                    continue

            candidate['start_line'] = index
            candidate['title_string'] = title
            best_details = candidate

        if found_title:
            # Title and first marker were confirmed: stop searching.
            break

    return best_details
def find_numeration_in_body(docbody):
"""Scan the given lines for a reference-line numeration marker.

Whitespace-only lines are skipped. Every other line is tested with
regex_match_list() against the patterns returned by
get_reference_line_numeration_marker_patterns(). When a line matches,
its 'mark' group and the matching regexp string are recorded, and the
scan stops as soon as the first re_num match inside the marker is
exactly '1'.

NOTE(review): indentation was lost in this copy of the file, so it is
not possible to tell from here whether the `else` branch below
("No numeration") pairs with `if mark_match:`, with the inner
`if m_num ...:`, or with the `for` loop. Confirm against the upstream
source before relying on the exact fallback behaviour.

@param docbody: (list) of strings - the lines to inspect (presumably
    the title line and the few lines following it; verify against caller).
@return: (tuple) (ref_details, found_title) where ref_details is None
    or a dictionary with the keys 'marker', 'marker_pattern' and
    'title_marker_same_line', and found_title is always False because
    it is never reassigned in this function.
"""
marker_patterns = get_reference_line_numeration_marker_patterns()
ref_details = None
# Never reassigned below: the second returned value is always False.
found_title = False
for line in docbody:
# Move past blank lines
if line.isspace():
continue
# Is this line numerated like a reference line?
mark_match = regex_match_list(line, marker_patterns)
if mark_match:
mark = mark_match.group('mark')
# Keep the regexp source string, not the compiled object.
mk_ptn = mark_match.re.pattern
ref_details = {
'marker': mark,
'marker_pattern': mk_ptn,
'title_marker_same_line': False,
}
# Check if it's the first reference
# Something like [1] or (1), etc.
m_num = re_num.search(mark)
if m_num and m_num.group(0) == '1':
# 1st ref truly found
break
# NOTE(review): the statement this `else` belongs to cannot be
# determined from this copy (see the docstring).
else:
# No numeration
ref_details = {
'title_marker_same_line': False,
'marker': None,
'marker_pattern': None,
}
return ref_details, found_title
def find_numeration_in_title(docbody, title):
    """Look for a numeration marker on the same line as the section title.

    Only the first element of docbody is examined. It is matched against
    the numeration-marker patterns built around the (regexp-escaped)
    title, e.g. a line such as 'References [1] Riotto...'.

    @param docbody: (list) of strings - lines starting at the title line.
    @param title: (string) - the reference section title that was matched.
    @return: (tuple) (ref_details, found_title):
        ref_details is None when docbody is empty or when the first line
        carries no marker next to the title; otherwise a dictionary with
        the keys 'marker', 'marker_pattern' and 'title_marker_same_line'
        (always True here).
        found_title is True only when the number found in the marker is
        exactly '1'.
    """
    try:
        title_line = docbody[0]
    except IndexError:
        # Nothing to inspect.
        return None, False

    # The title is embedded in a regexp, so it must be escaped to survive
    # titles such as 'References['.
    patterns = get_reference_line_numeration_marker_patterns(re.escape(title))
    match = regex_match_list(title_line, patterns)
    if not match:
        return None, False

    marker = match.group('mark')
    marker_pattern = match.re.pattern
    number = re_num.search(marker)
    # The section start is only confirmed when the marker is number 1.
    is_first_reference = bool(number and number.group(0) == '1')

    details = {
        'marker': marker,
        'marker_pattern': marker_pattern,
        'title_marker_same_line': True
    }
    return details, is_first_reference
def find_numeration(docbody, title):
    """Find the numeration pattern of a reference section.

    Two strategies are tried in order:

    1. Numeration on the title line itself (find_numeration_in_title),
       e.g.
           References [4] Riotto...
    2. Numeration in the lines handed over in docbody
       (find_numeration_in_body), e.g. the marker alone on a line
           References
           1
           Riotto
       or the marker at the start of the following line
           References
           [1] Riotto

    @param docbody: (list) of strings - lines starting at the title line.
    @param title: (string) - the matched reference section title.
    @return: (tuple) (ref_details, found_title) as produced by whichever
        strategy was used; the second strategy is only consulted when the
        first one yields no details.
    """
    details, title_confirmed = find_numeration_in_title(docbody, title)
    if details:
        return details, title_confirmed

    # Nothing next to the title: fall back to scanning the lines.
    details, title_confirmed = find_numeration_in_body(docbody)
    return details, title_confirmed
def find_reference_section_no_title_via_brackets(docbody):
    """Find a title-less reference section using bracketed markers.

    Intended as a fallback for when the reference section could not be
    located through its title: reference lines are recognised by numeric
    markers of the form [1], [2], etc.

    @param docbody: (list) of strings - each string is a line of the
        document.
    @return: (dictionary) as built by
        find_reference_section_no_title_generic():
        { 'start_line'             : (integer) index in docbody of the
                                     first reference line,
          'title_string'           : (None) since there is no title,
          'marker'                 : (string) marker of the first
                                     reference line,
          'marker_pattern'         : (string) regexp string that matched
                                     the marker,
          'title_marker_same_line' : False - no title, hence never on the
                                     marker's line,
        }
        -- OR --
        (None) when the reference section could not be found.
    """
    return find_reference_section_no_title_generic(
        docbody, [re_reference_line_bracket_markers])
def find_reference_section_no_title_via_dots(docbody):
    """Find a title-less reference section using dotted markers.

    Intended as a fallback for when the reference section could not be
    located through its title: reference lines are recognised by numeric
    markers of the form 1., 2., etc.

    @param docbody: (list) of strings - each string is a line of the
        document.
    @return: (dictionary) as built by
        find_reference_section_no_title_generic():
        { 'start_line'             : (integer) index in docbody of the
                                     first reference line,
          'title_string'           : (None) since there is no title,
          'marker'                 : (string) marker of the first
                                     reference line,
          'marker_pattern'         : (string) regexp string that matched
                                     the marker,
          'title_marker_same_line' : False - no title, hence never on the
                                     marker's line,
        }
        -- OR --
        (None) when the reference section could not be found.
    """
    return find_reference_section_no_title_generic(
        docbody, [re_reference_line_dot_markers])
def find_reference_section_no_title_via_numbers(docbody):
    """Find a title-less reference section using bare numeric markers.

    Intended as a fallback for when the reference section could not be
    located through its title: reference lines are recognised by numeric
    markers of the form 1, 2, etc.

    @param docbody: (list) of strings - each string is a line of the
        document.
    @return: (dictionary) as built by
        find_reference_section_no_title_generic():
        { 'start_line'             : (integer) index in docbody of the
                                     first reference line,
          'title_string'           : (None) since there is no title,
          'marker'                 : (string) marker of the first
                                     reference line,
          'marker_pattern'         : (string) regexp string that matched
                                     the marker,
          'title_marker_same_line' : False - no title, hence never on the
                                     marker's line,
        }
        -- OR --
        (None) when the reference section could not be found.
    """
    return find_reference_section_no_title_generic(
        docbody, [re_reference_line_number_markers])
def find_reference_section_no_title_generic(docbody, marker_patterns):
    """Find a title-less reference section from its numeration markers.

    The document is scanned from its last line towards its first, looking
    for a line whose marker number is '1'. Such a line is accepted as the
    start of the reference section when either
      - fewer than 5 lines follow it (a single reference near the end of
        the document is assumed), or
      - a line whose marker number is '2' occurs within the 10 lines
        following it.

    @param docbody: (list) of strings - each string is a line of the
        document.
    @param marker_patterns: (list) of compiled patterns used to recognise
        a reference line marker; a match must expose the groups 'mark'
        and 'marknum'.
    @return: (dictionary) :
        { 'start_line'             : (integer) index in docbody of the
                                     first reference line,
          'title_string'           : (None) since there is no title,
          'marker'                 : (string) stripped marker of the first
                                     reference line,
          'marker_pattern'         : (string) regexp string that matched
                                     the marker,
          'title_marker_same_line' : False,
        }
        -- OR --
        (None) when docbody is empty or no reference section was found.
    """
    if not docbody:
        return None

    lookahead = 10
    total_lines = len(docbody)

    for reversed_index, line in enumerate(reversed(docbody)):
        first_match = regex_match_list(line.strip(), marker_patterns)
        if not first_match or first_match.group('marknum') != '1':
            continue

        marker_pattern = first_match.re.pattern
        start_line = total_lines - 1 - reversed_index
        # Lines that come right after the candidate first reference.
        following = docbody[start_line + 1:start_line + 1 + lookahead]

        if len(following) < 5:
            # A '1' this close to the end: assume a single reference.
            confirmed = True
        else:
            # Otherwise insist on seeing reference number 2 nearby.
            confirmed = False
            for next_line in following:
                second_match = regex_match_list(next_line.strip(),
                                                marker_patterns)
                if second_match and second_match.group('marknum') == '2':
                    confirmed = True
                    break

        if confirmed:
            return {
                'start_line': start_line,
                'title_string': None,
                'marker': first_match.group('mark').strip(),
                'marker_pattern': marker_pattern,
                'title_marker_same_line': False,
            }

    # didn't manage to find the reference section
    return None
def find_end_of_reference_section(docbody,
ref_start_line,
ref_line_marker,
ref_line_marker_ptn):
"""Given that the start of a document's reference section has already been
recognised, this function is tasked with finding the line-number in the
document of the last line of the reference section.

The lines are walked forward from ref_start_line. The section is
considered finished when a line matches a "post reference section" title
pattern and no continuing reference marker is found after it, or when a
run of long digit-only lines (probably the axis scale of a graph) is
met.

NOTE(review): indentation was lost in this copy of the file; the nesting
of the trailing `if not section_ended:` block and of `x += 1` cannot be
confirmed from here - check against the upstream source.

@param docbody: (list) of strings - the entire plain-text document body.
@param ref_start_line: (integer) - the index in docbody of the first line
of the reference section.
@param ref_line_marker: (string) - the line marker of the first reference
line.
@param ref_line_marker_ptn: (string) - the pattern used to search for a
reference line marker.
@return: (integer) - index in docbody of the last reference line
-- OR --
(None) - if ref_start_line was invalid.
"""
section_ended = False
x = ref_start_line
# Note: the bound check uses `>` and not `>=`, so x == len(docbody) is
# accepted here; the while loop below then never runs.
if type(x) is not int or x < 0 or \
x > len(docbody) or len(docbody) < 1:
# The provided 'first line' of the reference section was invalid.
# Either it was out of bounds in the document body, or it was not a
# valid integer.
# Can't safely find end of refs with this info - quit.
return None
# Get patterns for testing line:
t_patterns = get_post_reference_section_title_patterns()
kw_patterns = get_post_reference_section_keyword_patterns()
# Use the marker pattern found at the section start when both the marker
# and its pattern are known; otherwise fall back to the generic
# numeration marker patterns.
if None not in (ref_line_marker, ref_line_marker_ptn):
mk_patterns = [re.compile(ref_line_marker_ptn, re.I|re.UNICODE)]
else:
mk_patterns = get_reference_line_numeration_marker_patterns()
# Number of the most recent reference marker seen so far.
current_reference_count = 0
while x < len(docbody) and not section_ended:
# save the reference count
num_match = regex_match_list(docbody[x].strip(), mk_patterns)
if num_match:
try:
current_reference_count = int(num_match.group('marknum'))
except (ValueError, IndexError):
# non numerical references marking
pass
# look for a likely section title that would follow a reference section:
end_match = regex_match_list(docbody[x].strip(), t_patterns)
if not end_match:
# didn't match a section title - try looking for keywords that
# suggest the end of a reference section:
# NOTE(review): the result of this keyword match is not read again
# anywhere in the visible code.
end_match = regex_match_list(docbody[x].strip(), kw_patterns)
else:
# Is it really the end of the reference section? Check the following
# lines (bounded by y < x + 200) for other reference numeration markers:
y = x + 1
line_found = False
while y < x + 200 and y < len(docbody) and not line_found:
num_match = regex_match_list(docbody[y].strip(), mk_patterns)
# Ignore matches whose whole matched text consists of digits only.
if num_match and not num_match.group(0).isdigit():
try:
num = int(num_match.group('marknum'))
# Only a marker that continues the numbering counts.
if current_reference_count + 1 == num:
line_found = True
except ValueError:
# We have the marknum index so it is
# numeric pattern for references like
# [1], [2] but this match is not a number
pass
except IndexError:
# We have a non numerical references marking
# we don't check for a number continuity
line_found = True
y += 1
if not line_found:
# No ref line found-end section
section_ended = True
if not section_ended:
# Do this line and the lines right after it simply contain numbers?
# (digit_lines starts at 4 and counts this line.) If yes, it's
# probably the axis scale of a graph in a fig. End refs section
digit_test_str = docbody[x].replace(" ", "").\
replace(".", "").\
replace("-", "").\
replace("+", "").\
replace(u"\u00D7", "").\
replace(u"\u2212", "").\
strip()
if len(digit_test_str) > 10 and digit_test_str.isdigit():
# The line contains only digits and is longer than 10 chars:
y = x + 1
digit_lines = 4
num_digit_lines = 1
while y < x + digit_lines and y < len(docbody):
digit_test_str = docbody[y].replace(" ", "").\
replace(".", "").\
replace("-", "").\
replace("+", "").\
replace(u"\u00D7", "").\
replace(u"\u2212", "").\
strip()
if len(digit_test_str) > 10 and digit_test_str.isdigit():
num_digit_lines += 1
elif len(digit_test_str) == 0:
# This is a blank line. Don't count it, to accommodate
# documents that are double-line spaced:
digit_lines += 1
y = y + 1
# Every line examined was either a long digit-only line or blank.
if num_digit_lines == digit_lines:
section_ended = True
x += 1
# The returned value is one less than the final value of x.
return x - 1
def get_reference_section_beginning(fulltext):
    """Find where the reference section of a document starts.

    Several strategies are tried in turn, from the strongest to the
    weakest; the first one that returns a result wins and its rank is
    stored under the 'how_found_start' key:
        1 - the reference section title was found,
        2 - no title, bracketed markers ([1], [2], ...),
        3 - no title, dotted markers (1., 2., ...),
        4 - no title, bare numeric markers (1, 2, ...).

    @param fulltext: (list) of strings - the lines of the document.
    @return: (dictionary) the details returned by the winning strategy,
        with 'how_found_start' added
        -- OR --
        (None) when no strategy found a reference section.
    """
    strategies = (
        (find_reference_section, 1),
        (find_reference_section_no_title_via_brackets, 2),
        (find_reference_section_no_title_via_dots, 3),
        (find_reference_section_no_title_via_numbers, 4),
    )

    sect_start = None
    for finder, how_found in strategies:
        sect_start = finder(fulltext)
        if sect_start is not None:
            sect_start['how_found_start'] = how_found
            break

    if sect_start:
        write_message('* title %r' % sect_start['title_string'], verbose=3)
        write_message('* marker %r' % sect_start['marker'], verbose=3)
        write_message('* title_marker_same_line %s'
                      % sect_start['title_marker_same_line'], verbose=3)
    else:
        write_message('* could not find references section', verbose=3)

    return sect_start

Event Timeline