Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F101648288
bom_textdoc.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, Feb 12, 10:41
Size
6 KB
Mime Type
text/x-python
Expires
Fri, Feb 14, 10:41 (2 d)
Engine
blob
Format
Raw Data
Handle
24206003
Attached To
R3600 invenio-infoscience
bom_textdoc.py
View Options
## This file is part of Invenio.
## Copyright (C) 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""BibObject Module providing BibObject prividing features for documents containing text (not necessarily as the main part of the content)"""
from
invenio.bibdocfile
import
BibDoc
,
InvenioBibDocFileError
from
invenio.dbquery
import
run_sql
from
datetime
import
datetime
from
invenio.errorlib
import
register_exception
import
os
class
BibTextDoc
(
BibDoc
):
def
get_text
(
self
,
version
=
None
):
"""
@param version: the requested version. If not set, the latest version
will be used.
@type version: integer
@return: the textual content corresponding to the specified version
of the document.
@rtype: string
"""
if
version
is
None
:
version
=
self
.
get_latest_version
()
if
self
.
has_text
(
version
):
return
open
(
os
.
path
.
join
(
self
.
basedir
,
'.text;
%i
'
%
version
))
.
read
()
else
:
return
""
def
get_text_path
(
self
,
version
=
None
):
"""
@param version: the requested version. If not set, the latest version
will be used.
@type version: int
@return: the full path to the textual content corresponding to the specified version
of the document.
@rtype: string
"""
if
version
is
None
:
version
=
self
.
get_latest_version
()
if
self
.
has_text
(
version
):
return
os
.
path
.
join
(
self
.
basedir
,
'.text;
%i
'
%
version
)
else
:
return
""
def
extract_text
(
self
,
version
=
None
,
perform_ocr
=
False
,
ln
=
'en'
):
"""
Try what is necessary to extract the textual information of a document.
@param version: the version of the document for which text is required.
If not specified the text will be retrieved from the last version.
@type version: integer
@param perform_ocr: whether to perform OCR.
@type perform_ocr: bool
@param ln: a two letter language code to give as a hint to the OCR
procedure.
@type ln: string
@raise InvenioBibDocFileError: in case of error.
@note: the text is extracted and cached for later use. Use L{get_text}
to retrieve it.
"""
from
invenio.websubmit_file_converter
import
get_best_format_to_extract_text_from
,
convert_file
,
InvenioWebSubmitFileConverterError
if
version
is
None
:
version
=
self
.
get_latest_version
()
docfiles
=
self
.
list_version_files
(
version
)
## We try to extract text only from original or OCRed documents.
filenames
=
[
docfile
.
get_full_path
()
for
docfile
in
docfiles
if
'CONVERTED'
not
in
docfile
.
flags
or
'OCRED'
in
docfile
.
flags
]
try
:
filename
=
get_best_format_to_extract_text_from
(
filenames
)
except
InvenioWebSubmitFileConverterError
:
## We fall back on considering all the documents
filenames
=
[
docfile
.
get_full_path
()
for
docfile
in
docfiles
]
try
:
filename
=
get_best_format_to_extract_text_from
(
filenames
)
except
InvenioWebSubmitFileConverterError
:
open
(
os
.
path
.
join
(
self
.
basedir
,
'.text;
%i
'
%
version
),
'w'
)
.
write
(
''
)
return
try
:
convert_file
(
filename
,
os
.
path
.
join
(
self
.
basedir
,
'.text;
%i
'
%
version
),
'.txt'
,
perform_ocr
=
perform_ocr
,
ln
=
ln
)
if
version
==
self
.
get_latest_version
():
run_sql
(
"UPDATE bibdoc SET text_extraction_date=NOW() WHERE id=
%s
"
,
(
self
.
id
,
))
except
InvenioWebSubmitFileConverterError
,
e
:
register_exception
(
alert_admin
=
True
,
prefix
=
"Error in extracting text from bibdoc
%i
, version
%i
"
%
(
self
.
id
,
version
))
raise
InvenioBibDocFileError
,
str
(
e
)
def
pdf_a_p
(
self
):
"""
@return: True if this document contains a PDF in PDF/A format.
@rtype: bool"""
return
self
.
has_flag
(
'PDF/A'
,
'pdf'
)
def
has_text
(
self
,
require_up_to_date
=
False
,
version
=
None
):
"""
Return True if the text of this document has already been extracted.
@param require_up_to_date: if True check the text was actually
extracted after the most recent format of the given version.
@type require_up_to_date: bool
@param version: a version for which the text should have been
extracted. If not specified the latest version is considered.
@type version: integer
@return: True if the text has already been extracted.
@rtype: bool
"""
if
version
is
None
:
version
=
self
.
get_latest_version
()
if
os
.
path
.
exists
(
os
.
path
.
join
(
self
.
basedir
,
'.text;
%i
'
%
version
)):
if
not
require_up_to_date
:
return
True
else
:
docfiles
=
self
.
list_version_files
(
version
)
text_md
=
datetime
.
fromtimestamp
(
os
.
path
.
getmtime
(
os
.
path
.
join
(
self
.
basedir
,
'.text;
%i
'
%
version
)))
for
docfile
in
docfiles
:
if
text_md
<=
docfile
.
md
:
return
False
return
True
return
False
def
__repr__
(
self
):
return
'BibTextDoc(
%s
,
%s
,
%s
)'
%
(
repr
(
self
.
id
),
repr
(
self
.
doctype
),
repr
(
self
.
human_readable
))
def
supports
(
doctype
,
extensions
):
return
doctype
==
"Fulltext"
or
reduce
(
lambda
x
,
y
:
x
or
y
.
startswith
(
".pdf"
)
or
y
.
startswith
(
".ps"
)
,
extensions
,
False
)
def
create_instance
(
docid
=
None
,
doctype
=
'Main'
,
human_readable
=
False
,
# pylint: disable=W0613
initial_data
=
None
):
return
BibTextDoc
(
docid
=
docid
,
human_readable
=
human_readable
,
initial_data
=
initial_data
)
Event Timeline
Log In to Comment