Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91345550
pdf.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Nov 10, 04:56
Size
16 KB
Mime Type
text/x-python
Expires
Tue, Nov 12, 04:56 (2 d)
Engine
blob
Format
Raw Data
Handle
22246810
Attached To
R3600 invenio-infoscience
pdf.py
View Options
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
When a document is converted to plain-text from PDF,
certain characters may result in the plain-text, that are
either unwanted, or broken. These characters need to be corrected
or removed. Examples are, certain control characters that would
be illegal in XML and must be removed; TeX ligatures (etc); broken
accents such as umlauts on letters that must be corrected.
This function returns a dictionary of (unwanted) characters to look
for and the characters that should be used to replace them.
@return: (dictionary) - { seek -> replace, } of characters to
replace in plain-text.
"""
import
re
import
subprocess
from
six
import
iteritems
from
invenio.config
import
CFG_PATH_PDFTOTEXT
from
invenio.legacy.docextract.utils
import
write_message
# A dictionary of undesirable character sequences and their replacements.
#
# Keys are the sequences to look for in text converted from PDF; values are
# what they are replaced with (an empty string removes the sequence).
#
# Every key appears exactly once. Where an accent character is followed by a
# letter, both the "accent + letter" and the "accent + space + letter" forms
# are listed, because the space between the two is optional in the converted
# text.
UNDESIRABLE_CHAR_REPLACEMENTS = {
    # Control characters not allowed in XML:
    u'\u2028': u"",
    u'\u2029': u"",
    u'\u202A': u"",
    u'\u202B': u"",
    u'\u202C': u"",
    u'\u202D': u"",
    u'\u202E': u"",
    u'\u206A': u"",
    u'\u206B': u"",
    u'\u206C': u"",
    u'\u206D': u"",
    u'\u206E': u"",
    u'\u206F': u"",
    u'\uFFF9': u"",
    u'\uFFFA': u"",
    u'\uFFFB': u"",
    u'\uFFFC': u"",
    u'\uFEFF': u"",
    # Remove the result of a bad UTF-8 character
    u'\uFFFF': u"",
    # Language Tag Code Points:
    u"\U000E0000": u"",
    u"\U000E0001": u"",
    u"\U000E0002": u"",
    u"\U000E0003": u"",
    u"\U000E0004": u"",
    u"\U000E0005": u"",
    u"\U000E0006": u"",
    u"\U000E0007": u"",
    u"\U000E0008": u"",
    u"\U000E0009": u"",
    u"\U000E000A": u"",
    u"\U000E000B": u"",
    u"\U000E000C": u"",
    u"\U000E000D": u"",
    u"\U000E000E": u"",
    u"\U000E000F": u"",
    u"\U000E0010": u"",
    u"\U000E0011": u"",
    u"\U000E0012": u"",
    u"\U000E0013": u"",
    u"\U000E0014": u"",
    u"\U000E0015": u"",
    u"\U000E0016": u"",
    u"\U000E0017": u"",
    u"\U000E0018": u"",
    u"\U000E0019": u"",
    u"\U000E001A": u"",
    u"\U000E001B": u"",
    u"\U000E001C": u"",
    u"\U000E001D": u"",
    u"\U000E001E": u"",
    u"\U000E001F": u"",
    u"\U000E0020": u"",
    u"\U000E0021": u"",
    u"\U000E0022": u"",
    u"\U000E0023": u"",
    u"\U000E0024": u"",
    u"\U000E0025": u"",
    u"\U000E0026": u"",
    u"\U000E0027": u"",
    u"\U000E0028": u"",
    u"\U000E0029": u"",
    u"\U000E002A": u"",
    u"\U000E002B": u"",
    u"\U000E002C": u"",
    u"\U000E002D": u"",
    u"\U000E002E": u"",
    u"\U000E002F": u"",
    u"\U000E0030": u"",
    u"\U000E0031": u"",
    u"\U000E0032": u"",
    u"\U000E0033": u"",
    u"\U000E0034": u"",
    u"\U000E0035": u"",
    u"\U000E0036": u"",
    u"\U000E0037": u"",
    u"\U000E0038": u"",
    u"\U000E0039": u"",
    u"\U000E003A": u"",
    u"\U000E003B": u"",
    u"\U000E003C": u"",
    u"\U000E003D": u"",
    u"\U000E003E": u"",
    u"\U000E003F": u"",
    u"\U000E0040": u"",
    u"\U000E0041": u"",
    u"\U000E0042": u"",
    u"\U000E0043": u"",
    u"\U000E0044": u"",
    u"\U000E0045": u"",
    u"\U000E0046": u"",
    u"\U000E0047": u"",
    u"\U000E0048": u"",
    u"\U000E0049": u"",
    u"\U000E004A": u"",
    u"\U000E004B": u"",
    u"\U000E004C": u"",
    u"\U000E004D": u"",
    u"\U000E004E": u"",
    u"\U000E004F": u"",
    u"\U000E0050": u"",
    u"\U000E0051": u"",
    u"\U000E0052": u"",
    u"\U000E0053": u"",
    u"\U000E0054": u"",
    u"\U000E0055": u"",
    u"\U000E0056": u"",
    u"\U000E0057": u"",
    u"\U000E0058": u"",
    u"\U000E0059": u"",
    u"\U000E005A": u"",
    u"\U000E005B": u"",
    u"\U000E005C": u"",
    u"\U000E005D": u"",
    u"\U000E005E": u"",
    u"\U000E005F": u"",
    u"\U000E0060": u"",
    u"\U000E0061": u"",
    u"\U000E0062": u"",
    u"\U000E0063": u"",
    u"\U000E0064": u"",
    u"\U000E0065": u"",
    u"\U000E0066": u"",
    u"\U000E0067": u"",
    u"\U000E0068": u"",
    u"\U000E0069": u"",
    u"\U000E006A": u"",
    u"\U000E006B": u"",
    u"\U000E006C": u"",
    u"\U000E006D": u"",
    u"\U000E006E": u"",
    u"\U000E006F": u"",
    u"\U000E0070": u"",
    u"\U000E0071": u"",
    u"\U000E0072": u"",
    u"\U000E0073": u"",
    u"\U000E0074": u"",
    u"\U000E0075": u"",
    u"\U000E0076": u"",
    u"\U000E0077": u"",
    u"\U000E0078": u"",
    u"\U000E0079": u"",
    u"\U000E007A": u"",
    u"\U000E007B": u"",
    u"\U000E007C": u"",
    u"\U000E007D": u"",
    u"\U000E007E": u"",
    u"\U000E007F": u"",
    # Musical Notation Scoping
    u"\U0001D173": u"",
    u"\U0001D174": u"",
    u"\U0001D175": u"",
    u"\U0001D176": u"",
    u"\U0001D177": u"",
    u"\U0001D178": u"",
    u"\U0001D179": u"",
    u"\U0001D17A": u"",
    u'\u0000': u"",  # NULL
    u'\u0001': u"",  # START OF HEADING
    # START OF TEXT & END OF TEXT:
    u'\u0002': u"",
    u'\u0003': u"",
    u'\u0004': u"",  # END OF TRANSMISSION
    # ENQ and ACK
    u'\u0005': u"",
    u'\u0006': u"",
    u'\u0007': u"",  # BELL
    u'\u0008': u"",  # BACKSPACE
    # SHIFT-IN & SHIFT-OUT
    u'\u000E': u"",
    u'\u000F': u"",
    # Other controls:
    u'\u0010': u"",  # DATA LINK ESCAPE
    u'\u0011': u"",  # DEVICE CONTROL ONE
    u'\u0012': u"",  # DEVICE CONTROL TWO
    u'\u0013': u"",  # DEVICE CONTROL THREE
    u'\u0014': u"",  # DEVICE CONTROL FOUR
    u'\u0015': u"",  # NEGATIVE ACK
    u'\u0016': u"",  # SYNCHRONOUS IDLE
    u'\u0017': u"",  # END OF TRANSMISSION BLOCK
    u'\u0018': u"",  # CANCEL
    u'\u0019': u"",  # END OF MEDIUM
    u'\u001A': u"",  # SUBSTITUTE
    u'\u001B': u"",  # ESCAPE
    # INFORMATION SEPARATOR FOUR (U+001C) and THREE (U+001D) are not removed:
    # they are the "strange parentheses" mapped to '(' and ')' below.
    u'\u001E': u"",  # INFORMATION SEPARATOR TWO (record separator)
    u'\u001F': u"",  # INFORMATION SEPARATOR ONE (unit separator)
    # \r -> remove it
    u'\r': u"",
    # Strange parentheses - change for normal:
    u'\x1c': u'(',
    u'\x1d': u')',
    # Accented i from TeX (DEVICE CONTROL THREE + DATA LINK ESCAPE):
    u'\u0013\u0010': u'\u00ED',
    # ff from tex:
    u'\x0b': u'ff',
    # fi from tex:
    # NOTE(review): \x0c is also the form-feed (page-break) character;
    # confirm this mapping is applied only where page breaks no longer matter.
    u'\x0c': u'fi',
    # ligatures from TeX:
    u'\ufb00': u'ff',
    u'\ufb01': u'fi',
    u'\ufb02': u'fl',
    u'\ufb03': u'ffi',
    u'\ufb04': u'ffl',
    # Superscripts from TeX
    u'\u2212': u'-',
    u'\u2013': u'-',
    # Word style speech marks:
    # NOTE(review): u'\u201c' was listed twice with the same value; confirm
    # that no whitespace-carrying variant of the key was intended.
    u'\u201c': u'"',
    u'\u201d': u'"',
    # pdftotext has problems with umlaut and prints it as diaeresis
    # followed by a letter: correct it
    u'\u00A8a': u'\u00E4',
    u'\u00A8e': u'\u00EB',
    u'\u00A8i': u'\u00EF',
    u'\u00A8o': u'\u00F6',
    u'\u00A8u': u'\u00FC',
    u'\u00A8y': u'\u00FF',
    u'\u00A8A': u'\u00C4',
    u'\u00A8E': u'\u00CB',
    u'\u00A8I': u'\u00CF',
    u'\u00A8O': u'\u00D6',
    u'\u00A8U': u'\u00DC',
    u'\u00A8Y': u'\u0178',
    # (Optional space between char and letter - fixes broken
    # line examples)
    u'\u00A8 a': u'\u00E4',
    u'\u00A8 e': u'\u00EB',
    u'\u00A8 i': u'\u00EF',
    u'\u00A8 o': u'\u00F6',
    u'\u00A8 u': u'\u00FC',
    u'\u00A8 y': u'\u00FF',
    u'\u00A8 A': u'\u00C4',
    u'\u00A8 E': u'\u00CB',
    u'\u00A8 I': u'\u00CF',
    u'\u00A8 O': u'\u00D6',
    u'\u00A8 U': u'\u00DC',
    u'\u00A8 Y': u'\u0178',
    # More umlaut mess to correct (DELETE character used as diaeresis):
    u'\x7fa': u'\u00E4',
    u'\x7fe': u'\u00EB',
    u'\x7fi': u'\u00EF',
    u'\x7fo': u'\u00F6',
    u'\x7fu': u'\u00FC',
    u'\x7fy': u'\u00FF',
    u'\x7fA': u'\u00C4',
    u'\x7fE': u'\u00CB',
    u'\x7fI': u'\u00CF',
    u'\x7fO': u'\u00D6',
    u'\x7fU': u'\u00DC',
    u'\x7fY': u'\u0178',
    u'\x7f a': u'\u00E4',
    u'\x7f e': u'\u00EB',
    u'\x7f i': u'\u00EF',
    u'\x7f o': u'\u00F6',
    u'\x7f u': u'\u00FC',
    u'\x7f y': u'\u00FF',
    u'\x7f A': u'\u00C4',
    u'\x7f E': u'\u00CB',
    u'\x7f I': u'\u00CF',
    u'\x7f O': u'\u00D6',
    u'\x7f U': u'\u00DC',
    u'\x7f Y': u'\u0178',
    # pdftotext: fix acute accent:
    u'\x13a': u'\u00E1',
    u'\x13e': u'\u00E9',
    u'\x13i': u'\u00ED',
    u'\x13o': u'\u00F3',
    u'\x13u': u'\u00FA',
    u'\x13y': u'\u00FD',
    u'\x13A': u'\u00C1',
    u'\x13E': u'\u00C9',
    u'\x13I': u'\u00CD',
    u'\x13ı': u'\u00ED',  # Lower case turkish 'i' (dotless i)
    u'\x13O': u'\u00D3',
    u'\x13U': u'\u00DA',
    u'\x13Y': u'\u00DD',
    u'\x13 a': u'\u00E1',
    u'\x13 e': u'\u00E9',
    u'\x13 i': u'\u00ED',
    u'\x13 o': u'\u00F3',
    u'\x13 u': u'\u00FA',
    u'\x13 y': u'\u00FD',
    u'\x13 A': u'\u00C1',
    u'\x13 E': u'\u00C9',
    u'\x13 I': u'\u00CD',
    u'\x13 ı': u'\u00ED',
    u'\x13 O': u'\u00D3',
    u'\x13 U': u'\u00DA',
    u'\x13 Y': u'\u00DD',
    u'\u00B4a': u'\u00E1',
    u'\u00B4e': u'\u00E9',
    u'\u00B4i': u'\u00ED',
    u'\u00B4o': u'\u00F3',
    u'\u00B4u': u'\u00FA',
    u'\u00B4y': u'\u00FD',
    u'\u00B4A': u'\u00C1',
    u'\u00B4E': u'\u00C9',
    u'\u00B4I': u'\u00CD',
    u'\u00B4ı': u'\u00ED',
    u'\u00B4O': u'\u00D3',
    u'\u00B4U': u'\u00DA',
    u'\u00B4Y': u'\u00DD',
    u'\u00B4 a': u'\u00E1',
    u'\u00B4 e': u'\u00E9',
    u'\u00B4 i': u'\u00ED',
    u'\u00B4 o': u'\u00F3',
    u'\u00B4 u': u'\u00FA',
    u'\u00B4 y': u'\u00FD',
    u'\u00B4 A': u'\u00C1',
    u'\u00B4 E': u'\u00C9',
    u'\u00B4 I': u'\u00CD',
    u'\u00B4 ı': u'\u00ED',
    u'\u00B4 O': u'\u00D3',
    u'\u00B4 U': u'\u00DA',
    u'\u00B4 Y': u'\u00DD',
    # pdftotext: fix grave accent:
    u'\u0060a': u'\u00E0',
    u'\u0060e': u'\u00E8',
    u'\u0060i': u'\u00EC',
    u'\u0060o': u'\u00F2',
    u'\u0060u': u'\u00F9',
    u'\u0060A': u'\u00C0',
    u'\u0060E': u'\u00C8',
    u'\u0060I': u'\u00CC',
    u'\u0060O': u'\u00D2',
    u'\u0060U': u'\u00D9',
    u'\u0060 a': u'\u00E0',
    u'\u0060 e': u'\u00E8',
    u'\u0060 i': u'\u00EC',
    u'\u0060 o': u'\u00F2',
    u'\u0060 u': u'\u00F9',
    u'\u0060 A': u'\u00C0',
    u'\u0060 E': u'\u00C8',
    u'\u0060 I': u'\u00CC',
    u'\u0060 O': u'\u00D2',
    u'\u0060 U': u'\u00D9',
    # Acute accent printed after the letter:
    u'a´': u'á',
    u'i´': u'í',
    u'e´': u'é',
    u'u´': u'ú',
    u'o´': u'ó',
    # \02C7 : caron
    u'\u02C7C': u'\u010C',
    u'\u02C7c': u'\u010D',
    u'\u02C7S': u'\u0160',
    u'\u02C7s': u'\u0161',
    u'\u02C7Z': u'\u017D',
    u'\u02C7z': u'\u017E',
    # \02DA : aa (a with ring above)
    u'\u02DAa': u'\u00E5',
    u'\u02DAA': u'\u00C5',
    # \0327 : cedilla
    u'\u0327c': u'\u00E7',
    u'\u0327C': u'\u00C7',
    u'¸c': u'ç',
    # \02DC : tilde
    u'\u02DCn': u'\u00F1',
    u'\u02DCN': u'\u00D1',
    u'\u02DCo': u'\u00F5',
    u'\u02DCO': u'\u00D5',
    u'\u02DCa': u'\u00E3',
    u'\u02DCA': u'\u00C3',
    u'\u02DCs': u'\u0303s',  # Combining tilde with 's'
    # Circumflex accent (caret accent)
    u'aˆ': u'â',
    u'iˆ': u'î',
    u'eˆ': u'ê',
    u'uˆ': u'û',
    u'oˆ': u'ô',
    u'ˆa': u'â',
    u'ˆi': u'î',
    u'ˆe': u'ê',
    u'ˆu': u'û',
    u'ˆo': u'ô',
}
# Ordered (seek, replace) pairs. replace_undesirable_characters() applies
# these in list order before it applies UNDESIRABLE_CHAR_REPLACEMENTS, so put
# an entry here when it must run ahead of the character table.
# Both members of each pair are unicode strings, like the rest of the
# replacement data in this module.
UNDESIRABLE_STRING_REPLACEMENTS = [
    (u'\u201c', u'"'),
]
def replace_undesirable_characters(line):
    """
    Replace certain bad characters in a text line.

    The ordered pairs in UNDESIRABLE_STRING_REPLACEMENTS are applied first,
    in list order. The entries of UNDESIRABLE_CHAR_REPLACEMENTS are then
    applied longest key first (ties broken by key value), so that a
    multi-character fix such as "accent + letter" is applied before a
    single-character rule can delete the accent character it depends on.

    @param line: (string) the text line in which bad characters are to
        be replaced.
    @return: (string) the text line after the bad characters have been
        replaced.
    """
    # These are separate because we want a particular order
    for bad_string, replacement in UNDESIRABLE_STRING_REPLACEMENTS:
        line = line.replace(bad_string, replacement)

    # Plain dict iteration order is not a safe order here: the table holds
    # both u'\x13' -> u'' and u'\x13a' -> u'\u00E1', and whichever runs
    # first decides the result. Sorting makes the outcome deterministic.
    ordered_replacements = sorted(
        UNDESIRABLE_CHAR_REPLACEMENTS.items(),
        key=lambda item: (-len(item[0]), item[0]),
    )
    for bad_char, replacement in ordered_replacements:
        line = line.replace(bad_char, replacement)

    return line
def
pdftotext_conversion_is_bad
(
txtlines
):
"""Sometimes pdftotext performs a bad conversion which consists of many
spaces and garbage characters.
This method takes a list of strings obtained from a pdftotext conversion
and examines them to see if they are likely to be the result of a bad
conversion.
@param txtlines: (list) of unicode strings obtained from pdftotext
conversion.
@return: (integer) - 1 if bad conversion; 0 if good conversion.
"""
# Numbers of 'words' and 'whitespaces' found in document:
numWords
=
numSpaces
=
0
# whitespace character pattern:
p_space
=
re
.
compile
(
unicode
(
r'(\s)'
),
re
.
UNICODE
)
# non-whitespace 'word' pattern:
p_noSpace
=
re
.
compile
(
unicode
(
r'(\S+)'
),
re
.
UNICODE
)
for
txtline
in
txtlines
:
numWords
=
numWords
+
len
(
p_noSpace
.
findall
(
txtline
.
strip
()))
numSpaces
=
numSpaces
+
len
(
p_space
.
findall
(
txtline
.
strip
()))
if
numSpaces
>=
(
numWords
*
3
):
# Too many spaces - probably bad conversion
return
True
else
:
return
False
def convert_PDF_to_plaintext(fpath, keep_layout=False):
    """ Convert PDF to txt using pdftotext

    Take the path to a PDF file and run pdftotext for this file, capturing
    the output. A line that starts with a page-break (form-feed) character
    followed by text is split in two: the page break on its own line, then
    the text.

    @param fpath: (string) path to the PDF file
    @param keep_layout: (boolean) if true, run pdftotext with "-layout";
        otherwise run it with "-raw".
    @return: (tuple) (doclines, status). doclines is a list of unicode
        strings (contents of the PDF file translated into plaintext; each
        string is a line in the document). status is 0 normally; when the
        conversion is judged bad by pdftotext_conversion_is_bad(), status
        is 2 and doclines is an empty list.
    """
    if keep_layout:
        layout_option = "-layout"
    else:
        layout_option = "-raw"
    status = 0
    doclines = []
    # Pattern to check for lines with a leading page-break character.
    # If this pattern is matched, we want to split the page-break into
    # its own line because we rely upon this for trying to strip headers
    # and footers, and for some other pattern matching.
    # (Written as a u'' literal with doubled backslashes: the ur'' prefix
    # is a syntax error on Python 3.)
    p_break_in_line = re.compile(u'^\\s*\\f(.+)$', re.UNICODE)
    # build pdftotext command:
    cmd_pdftotext = [CFG_PATH_PDFTOTEXT, layout_option, "-q",
                     "-enc", "UTF-8", fpath, "-"]
    write_message("* %s" % ' '.join(cmd_pdftotext), verbose=2)
    # open pipe to pdftotext:
    pipe_pdftotext = subprocess.Popen(cmd_pdftotext, stdout=subprocess.PIPE)

    try:
        # read back results:
        for docline in pipe_pdftotext.stdout:
            unicodeline = docline.decode("utf-8")
            # Check for a page-break in this line:
            m_break_in_line = p_break_in_line.match(unicodeline)
            if m_break_in_line is None:
                # There was no page-break in this line. Just add the line:
                doclines.append(unicodeline)
            else:
                # If there was a page-break character in the same line as
                # some text, split it out into its own line so that we can
                # later try to find headers and footers:
                doclines.append(u"\f")
                doclines.append(m_break_in_line.group(1))
    finally:
        # Always release the pipe and reap the child process, even if
        # decoding a line raised, so no file descriptor or zombie is left.
        pipe_pdftotext.stdout.close()
        pipe_pdftotext.wait()

    write_message("* convert_PDF_to_plaintext found: "
                  "%s lines of text" % len(doclines), verbose=2)

    # finally, check conversion result not bad:
    if pdftotext_conversion_is_bad(doclines):
        status = 2
        doclines = []

    return (doclines, status)
Event Timeline
Log In to Comment