Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F101186230
read_pdf.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Feb 6, 14:31
Size
2 KB
Mime Type
text/x-python
Expires
Sat, Feb 8, 14:31 (1 d, 21 h)
Engine
blob
Format
Raw Data
Handle
24095609
Attached To
R13363 Coling_RAG_exercise
read_pdf.py
View Options
from
langchain_community.document_loaders
import
PyPDFLoader
import
os
# from langchain.document_loaders import PyPDFLoader
from
pdf2image
import
convert_from_path
import
pytesseract
from
langchain.schema
import
Document
def read_pdf(pdf_path):
    """Return the pages of a PDF file as a list of ``Document`` objects.

    The embedded text layer is read with ``PyPDFLoader`` first. If that
    raises or yields no page with any text (typical for scanned PDFs), the
    file is OCR'ed with ``ocr_scanned_pdf`` instead.

    Each returned page exposes ``page_content`` (the page text) and
    ``metadata['page']`` (the page number).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``Document`` objects, one per page that contains text.
        The list is empty when both extraction attempts fail; errors are
        printed rather than raised.
    """
    import re

    real_pages = []

    # First attempt: Use PyPDFLoader
    try:
        # load() returns one Document per PDF page. load_and_split() would
        # additionally run the default text splitter, which can cut a long
        # page into several overlapping chunks; gluing those chunks back
        # together duplicated the overlapping text.
        for page in PyPDFLoader(pdf_path).load():
            print(f"Page {page.metadata['page']}: {page.page_content[:100]}...")
            print('real_pages:', len(real_pages))
            # Drop pages without any extractable text so that an image-only
            # PDF still leaves real_pages empty and falls through to OCR.
            if page.page_content.strip():
                real_pages.append(page)
    except Exception as e:
        # Best-effort: report the problem and let the OCR fallback decide.
        print(f'Error reading pdf {pdf_path} with PyPDFLoader: {e}')

    # If PyPDFLoader failed or returned empty results, try OCR
    if not real_pages:
        print('PyPDFLoader failed or returned no pages. Attempting to read PDF using OCR...')
        try:
            ocr_text = ocr_scanned_pdf(pdf_path)
            # ocr_scanned_pdf() puts a "--- Page N ---" header line before
            # each page. Splitting on the whole header (with N captured)
            # yields [preamble, N1, text1, N2, text2, ...], so the header
            # never leaks into the page text and pages whose OCR result is
            # empty can really be skipped.
            pieces = re.split(r'^--- Page (\d+) ---$', ocr_text, flags=re.MULTILINE)
            for number, body in zip(pieces[1::2], pieces[2::2]):
                body = body.strip()
                if body:
                    # NOTE(review): OCR pages are numbered from 1, while
                    # PyPDFLoader's 'page' metadata is documented as
                    # 0-based -- confirm which convention callers expect.
                    real_pages.append(
                        Document(page_content=body, metadata={'page': int(number)})
                    )
        except Exception as ocr_error:
            print(f'Error reading pdf {pdf_path} with OCR: {ocr_error}')

    return real_pages
def ocr_scanned_pdf(pdf_path):
    """OCR every page of a PDF and return the recognised text as one string.

    The PDF is rendered to images with ``convert_from_path`` and each image
    is passed to ``pytesseract.image_to_string``. In the returned string each
    page is introduced by a ``--- Page N ---`` header line (N counted from 1)
    and followed by a blank line. A preview of the first 100 characters of
    every page is printed while processing.
    """
    chunks = []
    for i, scan in enumerate(convert_from_path(pdf_path)):
        page_text = pytesseract.image_to_string(scan)
        print(f"OCR result for page {i+1}: {page_text[:100]}...")
        # Collect the per-page sections and join once at the end.
        chunks.append(f"--- Page {i+1} ---\n{page_text}\n\n")
    return "".join(chunks)
if __name__ == '__main__':
    # Script entry point: extract the text of one sample PDF and store it
    # next to the source file as plain text.
    pdf_path = '''./data/LEX_001.pdf'''
    pages = read_pdf(pdf_path)

    # Concatenate all pages, terminating each page with a newline.
    text = ''.join(page.page_content + '\n' for page in pages)

    # Save to file. The encoding is explicit because extracted or OCR'ed text
    # may contain non-ASCII characters, and the platform default encoding is
    # not guaranteed to be able to represent them.
    with open('./data/LEX_001.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    print(pages)
Event Timeline
Log In to Comment