Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F74480762
PdfExtractorFactory.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Jul 28, 01:25
Size
3 KB
Mime Type
text/x-python
Expires
Tue, Jul 30, 01:25 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
19399707
Attached To
R10013 cop-mining-participants
PdfExtractorFactory.py
View Options
import
os
from
pathlib
import
Path
from
partlistproc.DigitalPdfExtractor
import
DigitalPdfExtractor
from
partlistproc.OcrExtractor
import
OcrExtractor
class PdfExtractorFactory():
    """Select and build the PDF extractor that fits a given meeting.

    Depending on the meeting label, the factory returns either an
    ``OcrExtractor`` (for meetings listed in ``meetings_that_need_ocr``) or a
    ``DigitalPdfExtractor`` (for everything else), initialized with the
    per-meeting start page and, where applicable, extra options.
    """

    # Meetings whose participant list has to be read via OCR.
    meetings_that_need_ocr = [
        "cop1", "cop2", "cop3", "cop4", "cop7", "cop8",
        "sb1", "sb2", "sb4", "sb5", "sb6", "sb7", "sb10", "sb12", "sb13",
    ]

    # Meetings whose participant list is split over several PDF files.
    meetings_with_several_pdfs = [
        "cop11", "cop13", "cop14", "cop15", "cop16", "cop17", "cop21", "cop22",
    ]

    meetings_with_corrigendum = []  # TODO

    # Normally, the first page of the list is this one.
    default_startpage = 3

    # For the meetings that differ, the start page is given in this dict.
    custom_default_startpage = {
        "cop2": 2,
        "cop5": 2,
        "cop23": 2,
        "cop24": 2,
        "cop25": 2,
        "sb1": 2,
        "sb2": 2,
        "sb4": 2,
        "sb5": 2,
        "sb7": 2,
        "sb46": 2,
        "sb48": 2,
        "sb50": 2,
    }

    # Same for the end page (only passed on to the OCR extractor).
    # NOTE(review): 0 presumably means "no explicit end page" -- confirm
    # against OcrExtractor.
    default_endpage = 0
    custom_default_endpage = {
        "cop1": 126,
    }

    def __init__(self, label, output_file):
        """Create a factory for one meeting.

        Args:
            label (str): label of the meeting to process
            output_file (str): name of the file to put the text in

        Raises:
            ValueError: if neither the single PDF nor the first part of a
                split PDF exists for this label.
        """
        self.label = label

        # Check that we have a list for this label: either one single PDF,
        # or at least part 1 of a PDF that is split into several files.
        has_single_pdf = os.path.isfile(DigitalPdfExtractor.getPDFpath(label))
        if not (has_single_pdf
                or os.path.isfile(DigitalPdfExtractor.getPDFpath(label, 1))):
            raise ValueError(
                "For this meeting, no PDF list is located in the data folder."
            )

        self.output_file = output_file

    def _count_pdf_parts(self):
        """Return how many consecutively numbered PDF parts exist (from 1)."""
        parts = 0
        # Parts are numbered 1, 2, 3, ...; stop at the first missing file.
        while os.path.isfile(
                DigitalPdfExtractor.getPDFpath(self.label, parts + 1)):
            parts += 1
        return parts

    def createPdfExtractor(self):
        """Return the matching extractor, correctly initialized.

        Returns:
            An ``OcrExtractor`` if the meeting label is listed in
            ``meetings_that_need_ocr``, otherwise a ``DigitalPdfExtractor``.
            The caller is responsible for running the extraction; this method
            only constructs the object.
        """
        startpage = self.custom_default_startpage.get(
            self.label, self.default_startpage)

        if self.label in self.meetings_that_need_ocr:
            # Use OCR
            endpage = self.custom_default_endpage.get(
                self.label, self.default_endpage)
            return OcrExtractor(
                self.label, self.output_file, startpage, endpage)

        # Use PDF to txt
        pdf_path = DigitalPdfExtractor.getPDFpath(self.label)

        if self.label in self.meetings_with_several_pdfs:
            # PDF is split into several files: tell the extractor how many.
            return DigitalPdfExtractor(
                pdf_path, self.output_file, startpage,
                list_parts=self._count_pdf_parts())

        if self.label == "cop5":
            # cop5 has a special structure
            return DigitalPdfExtractor(
                pdf_path, self.output_file, startpage, column_tolerance=50)

        # normal case: just one pdf
        return DigitalPdfExtractor(pdf_path, self.output_file, startpage)
Event Timeline
Log In to Comment