File Metadata

Created: Thu, Feb 6, 14:31

read_pdf.py
View Options

	from langchain_community.document_loaders import PyPDFLoader
	import os


	# from langchain.document_loaders import PyPDFLoader
	from pdf2image import convert_from_path
	import pytesseract
	from langchain.schema import Document


	def read_pdf(pdf_path):
	    """Return the pages of a PDF, falling back to OCR when no text is found.

	    The file is first read with ``PyPDFLoader.load_and_split()``. Consecutive
	    chunks that report the same ``metadata['page']`` are merged back into a
	    single page. If that step raises or yields nothing, the PDF is OCR'd via
	    ``ocr_scanned_pdf`` and one ``Document`` is built per non-empty page.

	    Args:
	        pdf_path: Path of the PDF file to read.

	    Returns:
	        A list of page objects exposing ``page_content`` (the text) and
	        ``metadata['page']`` (the page number). The list is empty when both
	        strategies fail; errors are printed rather than raised.
	    """
	    import re  # local import: only the OCR fallback needs it

	    real_pages = []

	    # First attempt: use PyPDFLoader.
	    try:
	        loader = PyPDFLoader(pdf_path)
	        pages = loader.load_and_split()

	        last_page_index = -1
	        for page in pages:
	            print(f"Page {page.metadata['page']}: {page.page_content[:100]}...")
	            print('real_pages:', len(real_pages))
	            page_index = page.metadata['page']
	            if page_index == last_page_index:
	                # Same page split into several chunks: glue them together.
	                real_pages[-1].page_content += page.page_content
	            else:
	                real_pages.append(page)
	            last_page_index = page_index
	    except Exception as e:
	        # Best effort: report and let the OCR fallback below take over.
	        print(f'Error reading pdf {pdf_path} with PyPDFLoader: {e}')

	    # If PyPDFLoader failed or returned empty results, try OCR.
	    if not real_pages:
	        print('PyPDFLoader failed or returned no pages. Attempting to read PDF using OCR...')
	        try:
	            ocr_text = ocr_scanned_pdf(pdf_path)
	            # ocr_scanned_pdf prefixes every page with a "--- Page N ---" line.
	            # Split on the *whole* marker line so no marker remnant ("N ---")
	            # leaks into the page text and blank pages can really be skipped.
	            # With one capture group the result is
	            # [preamble, number, text, number, text, ...].
	            parts = re.split(r'^--- Page (\d+) ---$', ocr_text, flags=re.MULTILINE)
	            for number, page_content in zip(parts[1::2], parts[2::2]):
	                page_content = page_content.strip()
	                if page_content:
	                    # NOTE(review): OCR pages are numbered from 1 (as in the
	                    # marker); PyPDFLoader's 'page' metadata appears to be
	                    # 0-based - confirm whether callers care.
	                    real_pages.append(
	                        Document(page_content=page_content, metadata={'page': int(number)})
	                    )
	        except Exception as ocr_error:
	            print(f'Error reading pdf {pdf_path} with OCR: {ocr_error}')

	    return real_pages

	def ocr_scanned_pdf(pdf_path):
	    """OCR every page of a PDF and return the combined text.

	    The PDF is rendered to images with ``convert_from_path`` and each image
	    is passed to ``pytesseract.image_to_string``. Every page contributes a
	    "--- Page N ---" header line (N counted from 1), the recognised text,
	    and a trailing blank line. A short preview of each page is printed.
	    """
	    sections = []
	    page_images = convert_from_path(pdf_path)
	    for page_number, page_image in enumerate(page_images, start=1):
	        recognized = pytesseract.image_to_string(page_image)
	        print(f"OCR result for page {page_number}: {recognized[:100]}...")
	        sections.append(f"--- Page {page_number} ---\n{recognized}\n\n")
	    return "".join(sections)


	if __name__ == '__main__':
	    # Manual smoke run: extract one sample PDF and dump its text next to it.
	    pdf_path = '''./data/LEX_001.pdf'''
	    pages = read_pdf(pdf_path)

	    # Concatenate all pages, one trailing newline per page.
	    text = ''.join(page.page_content + '\n' for page in pages)

	    # Write as UTF-8 explicitly: extracted text may contain non-ASCII
	    # characters that the platform default encoding cannot represent.
	    with open('./data/LEX_001.txt', 'w', encoding='utf-8') as f:
	        f.write(text)

	    print(pages)

read_pdf.py
No OneTemporary
Actions

File Metadata

read_pdf.py
View Options

Event Timeline

read_pdf.pyNo OneTemporaryActions

File Metadata

read_pdf.pyView Options

Event Timeline

read_pdf.py
No OneTemporary
Actions

read_pdf.py
View Options