Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F74601076
hocrlib.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Jul 28, 16:20
Size
7 KB
Mime Type
text/x-python
Expires
Tue, Jul 30, 16:20 (2 d)
Engine
blob
Format
Raw Data
Handle
19415067
Attached To
R3600 invenio-infoscience
hocrlib.py
View Options
## This file is part of Invenio.
## Copyright (C) 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""hOCR parser and tools"""
from
htmlentitydefs
import
entitydefs
import
HTMLParser
import
re
import
os.path
from
logging
import
info
from
reportlab.pdfgen.canvas
import
Canvas
from
reportlab.lib.units
import
inch
from
reportlab.lib.colors
import
green
,
red
CFG_PPM_RESOLUTION
=
300
_RE_PARSE_HOCR_BBOX
=
re
.
compile
(
r'\bbbox\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)'
)
_RE_CLEAN_SPACES
=
re
.
compile
(
r'\s+'
)
def extract_hocr(hocr_text):
    """
    Parse hocr_text and return a structure suitable to be used by create_pdf.

    @param hocr_text the hOCR (HTML) document to parse.
    @return a list with one (page_bbox, image, lines) tuple for every
        ocr_page element that declares an image property, where:
          - page_bbox is a tuple of four ints, or None when the page has
            no parsable bbox property;
          - image is the absolute path built from the image property;
          - lines is a list of (bbox, text) tuples, one per ocr_line
            element having a parsable bbox, sorted by decreasing bbox[1].
    """
    # Imported locally so that this function does not depend on a change
    # to the module imports. In Python 2, entitydefs only contains Latin-1
    # replacement text: entities outside Latin-1 are stored there as
    # '&#NNNN;' references, so code points are needed to decode them.
    from htmlentitydefs import name2codepoint

    class HOCRReader(HTMLParser.HTMLParser):
        """HTML parser collecting the pages and text lines of an hOCR file."""

        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            self.lines = []        # (bbox, text) tuples of the current page
            self.bbox = None       # bbox of the line being read, if any
            self.text = ""         # text gathered for the line being read
            self.image = ''        # image path of the current page
            self.page_bbox = None  # bbox of the current page, if any
            self.pages = []        # finished (page_bbox, image, lines)
            self.started = False   # True once an ocr_line has been seen

        def store_current_page(self):
            """Flush the page being read into self.pages and reset state."""
            # The pending line is always flushed and the state always
            # reset, so that nothing belonging to a page without image
            # can end up in the following page.
            self.store_current_line()
            if self.image:
                self.sort_current_lines()
                self.pages.append((self.page_bbox, self.image, self.lines))
            self.page_bbox = None
            self.image = ''
            self.lines = []

        def sort_current_lines(self):
            """Sort the lines of the current page by decreasing bbox[1]."""
            # list.sort is stable also with reverse=True, hence this is
            # equivalent to sorting with cmp(y0_b, y0_a).
            self.lines.sort(key=lambda line: line[0][1], reverse=True)

        def store_current_line(self):
            """Flush the line being read into self.lines and reset state."""
            if self.bbox:
                cleaned_text = _RE_CLEAN_SPACES.sub(' ', self.text).strip()
                self.lines.append((self.bbox, cleaned_text))
            # Reset unconditionally: the text of a line skipped because of
            # a missing bbox must not be prepended to the next line.
            self.bbox = None
            self.text = ""

        def extract_hocr_properties(self, title):
            """
            Split a title attribute ("key value; key value; ...") into a
            dictionary mapping lowercased keys to their value.

            Empty properties (e.g. caused by a trailing ';' or by a missing
            title) are ignored; a property without value maps to ''.
            """
            ret = {}
            for prop in (title or '').split(';'):
                pieces = prop.strip().split(None, 1)
                if not pieces:
                    continue
                key = pieces[0].lower()
                if len(pieces) > 1:
                    ret[key] = pieces[1].strip()
                else:
                    ret[key] = ''
            return ret

        def parse_bbox(self, properties):
            """
            Return the bbox property as a tuple of four ints, or None
            when it is missing or malformed.
            """
            try:
                coordinates = properties['bbox'].split()
                if len(coordinates) < 4:
                    return None
                return tuple([int(value) for value in coordinates[:4]])
            except (KeyError, ValueError):
                return None

        def handle_starttag(self, tag, attrs):
            attrs = dict(attrs)
            css_class = attrs.get('class')
            if tag == 'title':
                self.started = False
                # prevents <title> of the OCR output to be put into text
            elif css_class == 'ocr_line':
                self.started = True
                self.store_current_line()
                properties = self.extract_hocr_properties(attrs.get('title'))
                ## If no bbox is retrievable, let's skip this line
                self.bbox = self.parse_bbox(properties)
            elif css_class == 'ocr_page':
                self.store_current_page()
                properties = self.extract_hocr_properties(attrs.get('title'))
                self.page_bbox = self.parse_bbox(properties)
                image = properties.get('image')
                if image:
                    self.image = os.path.abspath(image)

        def handle_entityref(self, name):
            if self.started and name in name2codepoint:
                self.text += unichr(name2codepoint[name]).encode('utf8')

        def handle_data(self, data):
            # Whitespace-only chunks are not added to the text.
            if self.started and data.strip():
                self.text += data

        def handle_charref(self, data):
            if not self.started:
                return
            try:
                # Character references are either decimal (&#65;) or
                # hexadecimal (&#x41;).
                if data[:1] in ('x', 'X'):
                    codepoint = int(data[1:], 16)
                else:
                    codepoint = int(data)
                self.text += unichr(codepoint).encode('utf8')
            except (ValueError, OverflowError):
                ## Malformed or out of range reference: let's skip it
                pass

        def close(self):
            HTMLParser.HTMLParser.close(self)
            # Flush the last page, which no further ocr_page tag closes.
            self.store_current_page()

    hocr_reader = HOCRReader()
    hocr_reader.feed(hocr_text)
    hocr_reader.close()
    return hocr_reader.pages
def start_pdf(filename, author=None, keywords=None, subject=None, title=None):
    """ Starts a new pdf document
    @param filename the name of the PDF generated in output.
    @param author the author name.
    @param keywords the keywords of the document.
    @param subject the subject of the document.
    @param title the title of the document.
    @return the canvas on which pages can be added, with page compression
        enabled.
    """
    pdf = Canvas(filename)
    # Metadata setters, applied in this order and only for the values
    # which have been provided.
    metadata = (
        ('setAuthor', author),
        ('setKeywords', keywords),
        ('setTitle', title),
        ('setSubject', subject),
    )
    for setter_name, value in metadata:
        if value:
            getattr(pdf, setter_name)(value)
    pdf.setPageCompression(1)
    return pdf
def add_pdf_page(canvas, hocr, font='Courier', draft=False):
    """ Add one page of hOCR output into a searchable PDF.
    @param canvas the pdf being produced
    @param hocr one (page_bbox, image, lines) page of the structure coming
        from extract_hocr; the image element is not used here.
    @param font the default font (e.g. Courier, Times-Roman).
    @param draft whether to enable debug information in the output.

    Every line is written at the position of its bounding box, with the
    font size set to the box height and the text horizontally scaled to
    the box width.
    """
    px_per_pt = float(CFG_PPM_RESOLUTION) / 72. # pix to pts
    page_bbox, _, lines = hocr
    img_width, img_height = page_bbox[2:]
    page_width = img_width / px_per_pt
    page_height = img_height / px_per_pt
    canvas.setPageSize((page_width, page_height))
    canvas.setFont(font, 12)

    for line_bbox, line in lines:
        if draft:
            canvas.setFillColor(red)
        left, top, right, bottom = line_bbox
        width = (right - left) / px_per_pt
        height = (bottom - top) / px_per_pt
        x0 = left / px_per_pt
        # hOCR coordinates grow downwards, PDF coordinates grow upwards.
        y0 = page_height - (top / px_per_pt) - height / 1.3
        canvas.setFontSize(height)
        text_width = canvas.stringWidth(line)
        if not text_width:
            info('%s, %s has width 0' % (line_bbox, line))
        else:
            text_object = canvas.beginText(x0, y0)
            # Stretch the text so that it spans the whole bounding box.
            text_object.setHorizScale(1.0 * width / text_width * 100)
            text_object.textOut(line)
            canvas.drawText(text_object)
        if draft:
            canvas.setStrokeColor(green)
            canvas.rect(x0, y0, width, height)

    if draft:
        # Mark the four corners of the page...
        corners = (
            (0, 0),
            (0, page_height),
            (page_width, 0),
            (page_width, page_height),
        )
        for corner_x, corner_y in corners:
            canvas.circle(corner_x, corner_y, 10, fill=1)
        # ...and the extent of the image, in green.
        canvas.setFillColor(green)
        canvas.setStrokeColor(green)
        canvas.circle(0, page_height - img_height / px_per_pt, 5, fill=1)
        canvas.circle(img_width / px_per_pt, img_height / px_per_pt, 5, fill=1)

    canvas.showPage()
    # NOTE(review): the document is saved after every page, and once more
    # by close_pdf -- confirm whether callers rely on this.
    canvas.save()
def close_pdf(canvas):
    """ Finishes the pdf file by saving the canvas.
    @param canvas the pdf being produced, as returned by start_pdf.
    """
    canvas.save()
def create_pdf(hocr, output_pdf, font='Courier', draft=False):
    """ transform hOCR information into a text-only PDF.
    @param hocr the hocr structure as coming from extract_hocr.
    @param output_pdf the name of the PDF generated in output.
    @param font the default font (e.g. Courier, Times-Roman).
    @param draft whether to enable debug information in the output.
    """
    pdf = start_pdf(output_pdf)
    # One PDF page is produced for each page of the hOCR structure.
    for hocr_page in hocr:
        add_pdf_page(pdf, hocr_page, font=font, draft=draft)
    close_pdf(pdf)
Event Timeline
Log In to Comment