hocrlib.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Fri, Jul 26, 14:26

hocrlib.py
View Options

	## This file is part of CDS Invenio.
	## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
	##
	## CDS Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## CDS Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	"""hOCR parser and tools"""

	from htmlentitydefs import entitydefs
	import HTMLParser
	import re
	import os.path
	from logging import info
	from reportlab.pdfgen.canvas import Canvas
	from reportlab.lib.pagesizes import A4
	from reportlab.lib.colors import green, red

	_RE_PARSE_HOCR_BBOX = re.compile(r'\bbbox\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)')
	_RE_CLEAN_SPACES = re.compile(r'\s+')
	def extract_hocr(hocr_text):
	"""
	Parse hocr_text and return a structure suitable to be used by create_pdf.
	"""
	class HOCRReader(HTMLParser.HTMLParser):
	def __init__(self):
	HTMLParser.HTMLParser.__init__(self)
	self.lines = []
	self.bbox = None
	self.text = ""
	self.image = ''
	self.page_bbox = None
	self.pages = []
	self.started = False

	def store_current_page(self):
	if self.image:
	self.store_current_line()
	self.sort_current_lines()
	self.pages.append((self.page_bbox, self.image, self.lines))
	self.page_bbox = None
	self.image = ''
	self.lines = []

	def sort_current_lines(self):
	def line_cmp(a, b):
	y0_a = a[0][1]
	y0_b = b[0][1]
	return cmp(y0_b, y0_a)

	self.lines.sort(line_cmp)

	def store_current_line(self):
	if self.bbox:
	self.lines.append((self.bbox, _RE_CLEAN_SPACES.sub(' ', self.text).strip()))
	self.bbox = None
	self.text = ""

	def extract_hocr_properties(self, title):
	properties = title.split(';')
	ret = {}
	for prop in properties:
	prop = prop.strip()
	key, value = prop.split(' ', 1)
	key = key.strip().lower()
	value = value.strip()
	ret[key] = value
	return ret

	def handle_starttag(self, tag, attrs):
	attrs = dict(attrs)
	if attrs.get('class') == 'ocr_line':
	self.started = True
	self.store_current_line()
	properties = self.extract_hocr_properties(attrs.get('title', ''))
	try:
	self.bbox = tuple(map(lambda x: int(x), properties['bbox'].split(' ', 4)))
	except:
	## If no bbox is retrievable, let's skip this line
	pass
	elif attrs.get('class') == 'ocr_page':
	self.store_current_page()
	properties = self.extract_hocr_properties(attrs.get('title', ''))
	try:
	self.page_bbox = tuple(map(lambda x: int(x), properties['bbox'].split(' ', 4)))
	except:
	## If no bbox is retrievable, let's skip this line
	pass
	try:
	self.image = os.path.abspath(properties['image'])
	except:
	pass

	def handle_entityref(self, name):
	if self.started and name in entitydefs:
	self.text += entitydefs[name].decode('latin1').encode('utf8')

	def handle_data(self, data):
	if self.started and data.strip():
	self.text += data

	def handle_charref(self, data):
	if self.started:
	try:
	self.text += unichr(int(data)).encode('utf8')
	except:
	pass

	def close(self):
	HTMLParser.HTMLParser.close(self)
	self.store_current_page()

	hocr_reader = HOCRReader()
	hocr_reader.feed(hocr_text)
	hocr_reader.close()
	return hocr_reader.pages

	def create_pdf(hocr, filename, font="Courier", author=None, keywords=None, subject=None, title=None, image_path=None, draft=False):
	""" transform hOCR information into a searchable PDF.
	@param hocr the hocr structure as coming from extract_hocr.
	@param filename the name of the PDF generated in output.
	@param font the default font (e.g. Courier, Times-Roman).
	@param author the author name.
	@param subject the subject of the document.
	@param title the title of the document.
	@param image_path the default path where images are stored. If not specified
	relative image paths will be resolved to the current directory.
	@param draft whether to enable debug information in the output.

	"""
	def adjust_image_size(width, height):
	return max(width / A4[0], height / A4[1])

	canvas = Canvas(filename)

	if author:
	canvas.setAuthor(author)

	if keywords:
	canvas.setKeywords(keywords)

	if title:
	canvas.setTitle(title)

	if subject:
	canvas.setSubject(subject)

	for bbox, image, lines in hocr:
	if not image.startswith('/') and image_path:
	image = os.path.abspath(os.path.join(image_path, image))
	img_width, img_height = bbox[2:]
	ratio = adjust_image_size(img_width, img_height)
	if draft:
	canvas.drawImage(image, 0, A4[1] - img_height / ratio , img_width / ratio, img_height / ratio)
	canvas.setFont(font, 12)
	for bbox, line in lines:
	if draft:
	canvas.setFillColor(red)
	x0, y0, x1, y1 = bbox
	width = (x1 - x0) / ratio
	height = ((y1 - y0) / ratio)
	x0 = x0 / ratio
	#for ch in 'gjpqy,(){}[];$@':
	#if ch in line:
	#y0 = A4[1] - (y0 / ratio) - height
	#break
	#else:
	y0 = A4[1] - (y0 / ratio) - height / 1.3
	#canvas.setFontSize(height * 1.5)
	canvas.setFontSize(height)
	text_width = canvas.stringWidth(line)
	if text_width:
	## If text_width != 0
	text_object = canvas.beginText(x0, y0)
	text_object.setHorizScale(1.0 * width / text_width * 100)
	text_object.textOut(line)
	canvas.drawText(text_object)
	else:
	info('%s, %s has width 0' % (bbox, line))
	if draft:
	canvas.setStrokeColor(green)
	canvas.rect(x0, y0, width, height)
	if draft:
	canvas.circle(0, 0, 10, fill=1)
	canvas.circle(0, A4[1], 10, fill=1)
	canvas.circle(A4[0], 0, 10, fill=1)
	canvas.circle(A4[0], A4[1], 10, fill=1)
	canvas.setFillColor(green)
	canvas.setStrokeColor(green)
	canvas.circle(0, A4[1] - img_height / ratio, 5, fill=1)
	canvas.circle(img_width / ratio, img_height /ratio, 5, fill=1)
	else:
	canvas.drawImage(image, 0, A4[1] - img_height / ratio , img_width / ratio, img_height / ratio)

	canvas.save()

hocrlib.pyNo OneTemporaryActions

File Metadata

hocrlib.pyView Options

Event Timeline

hocrlib.py
No OneTemporary
Actions

hocrlib.py
View Options