F86601711
main.py
File Metadata
Created: Mon, Oct 7, 11:59
Size: 7 KB
Mime Type: text/x-python
Expires: Wed, Oct 9, 11:59 (1 d, 23 h)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 21455447
Attached To: R11149 PDM-Nicola-Oulu
main.py
# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

from requests_html import HTMLSession
from selenium import webdriver
import time
import json
import random
import copy

def print_hi(name):
    # Use a breakpoint in the code line below to debug your script.
    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.

def crawl():
    # in jupyter notebook/spyder there is already an async session
    # https://github.com/psf/requests-html/issues/294#issuecomment-516709659
    url = "https://www.fonecta.fi/haku/a?location=oulu"
    s = HTMLSession()
    r = s.get(url)
    r.html.render(sleep=1)
    print(r.status_code)  # should be 200
    l = r.html.find(".ResultFilterContainerComponent_resultFilterOptions__3WUdC")
    print(l)
    print(len(l))
    script = """
    () => {
        $(document).ready(function() {
            $("#submit_button").click();
        })
    }
    """
    r.html.render(script=script, reload=False)
    # print(r.html.text)

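# A minimal async variant, suggested by the requests-html issue linked above:
# in Jupyter/Spyder an event loop is already running, so HTMLSession.render()
# cannot be used and AsyncHTMLSession / arender() is needed instead. Sketch
# only, never called by this script; it reuses the URL and selector from crawl().
from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()

async def crawl_async():
    url = "https://www.fonecta.fi/haku/a?location=oulu"
    r = await asession.get(url)
    await r.html.arender(sleep=1)  # awaitable counterpart of render()
    print(r.status_code)  # should be 200
    return r.html.find(".ResultFilterContainerComponent_resultFilterOptions__3WUdC")

# usage: results = asession.run(crawl_async); in a notebook, "await crawl_async()" also works
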
def append_list(l, ele):
    if ele in l:
        print("already exists")
    else:
        l.append(ele)
    return l

def crawl_links(driver, l, data):
    time.sleep(random.randint(1, 10))
    names = copy.deepcopy(data["names"])
    addresses = copy.deepcopy(data["addresses"])
    emails = copy.deepcopy(data["emails"])
    driver.get(l)
    time.sleep(1)
    # get address
    try:
        address = driver.find_elements_by_css_selector(".profile_header_component_profileAddressRowLnk__2eMaT span")
        tmp = address[1].get_attribute('innerHTML')
        print(tmp)
        addresses = append_list(addresses, tmp)
    except:
        pass
    # get name
    try:
        address = driver.find_elements_by_css_selector(".profile_header_component_profile-information-name__1P390")
        tmp = address[0].get_attribute('innerHTML')
        print(tmp)
        names = append_list(names, tmp)
    except:
        pass
    # get the email
    try:
        address = driver.find_elements_by_css_selector(".contact_info_row_email__2OtOQ span")
        tmp = address[1].get_attribute('innerHTML')
        print(tmp)
        emails = append_list(emails, tmp)
    except:
        pass
    if False:
        element = address
        driver.execute_script("""
        var element = arguments[0];
        element.parentNode.removeChild(element);
        """, element)
    if False:
        element = address
        attrs = driver.execute_script(
            'var items = {}; for (index = 0; index < arguments[0].attributes.length; ++index) { items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value }; return items;',
            element)
        print(attrs)
    time.sleep(1)
    data["names"] = names
    data["addresses"] = addresses
    data["emails"] = emails
    return data

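# Note: the find_elements_by_css_selector / find_element_by_xpath calls above are
# the Selenium 3 locator API, which was removed in Selenium 4. A hedged sketch of
# the Selenium 4 equivalent; this helper is not part of the original script and is
# never called here.
from selenium.webdriver.common.by import By

def get_inner_html_selenium4(driver, css_selector, index):
    # Selenium 4 replaces find_elements_by_css_selector with find_elements(By.CSS_SELECTOR, ...)
    elements = driver.find_elements(By.CSS_SELECTOR, css_selector)
    return elements[index].get_attribute('innerHTML')
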
def search_start_url(driver, url, data, links_json):
    print(url)
    driver.get(url)
    time.sleep(1)
    # confirm the use of cookies etc.
    try:
        button = driver.find_element_by_xpath("// *[ @ id = \"onetrust-accept-btn-handler\"]")
        button.click()
    except:
        pass
    time.sleep(1)
    try:
        button = driver.find_element_by_xpath("//*[@id=\"__next\"]/div/div[2]/div[3]/div[1]/div[1]/div[1]/button[3]")
        print(button)
        time.sleep(1)
        button.click()
        print("clicked on button")
        time.sleep(5)
        print("results")
        links = []
        for i in range(2, 10):
            try:
                href = driver.find_element_by_xpath("// *[ @ id = \"__next\"] / div / div[2] "
                                                    + "/ div[3] / div[2] / div[1] / div[1] / div / div[" + str(i)
                                                    + "] / div / div[2] / a[1]")
                links.append(href.get_attribute('href'))
            except:
                print("failed for link {}".format(i))
        time.sleep(1)
        for l in links:
            if l not in links_json:
                data = crawl_links(driver, l, data)
                links_json.append(l)
                with open('data_fi.json', 'w') as fp:
                    json.dump(data, fp)
                print("# names, # addresses, # emails")
                print(str(len(data['names'])) + "/" + str(len(data['addresses'])) + "/" + str(len(data['emails'])))
                with open('links_fi.json', 'w') as fp:
                    json.dump(links_json, fp)
    except:
        links = links_json
        print("failed to perform a search")
        pass
    return data, links

def crawl_selenium():
    data = {
        "names": [],
        "addresses": [],
        "emails": [],
    }
    links = []
    # resume from earlier runs if the checkpoint files exist
    try:
        with open('data_fi.json', 'r') as fp:
            data = json.load(fp)
        with open('links_fi.json', 'r') as fp:
            links = json.load(fp)
    except:
        pass
    PATH = r"C:\Program Files (x86)\chromedriver.exe"  # raw string so the backslashes stay literal
    driver = webdriver.Chrome(PATH)
    abc = "abcdefghijklmnopqrstuvwxyz"
    if False:
        for letter in abc:
            url = "https://www.fonecta.fi/haku/" + letter
            data, links = search_start_url(driver, url, data, links)
    print(data["names"])

    def clean_names(names):
        first = []
        last = []
        # iterate over a copy: removing from the list being iterated would skip entries
        for n in list(names):
            tmp = n.split(" ")
            # if it doesn't split into exactly two parts, it's most likely not a name
            if len(tmp) != 2:
                names.remove(n)
            else:
                first.append(tmp[0])
                last.append(tmp[1])
        return names, list(set(first)), list(set(last))

    def clean_loc(addresses):
        place = []
        street = []
        zipc = []
        for a in addresses:
            tmp = a.split(", ")
            street.append(tmp[0])
            tmp = tmp[1].split(" ")  # split on a space: str.split("") would raise ValueError
            place.append(tmp[-1])
            zipc.append(tmp[-2])
        return place, street, zipc

    data["names"], first, last = clean_names(data["names"])
    random.shuffle(first)
    print(len(last))
    print(len(first))
    print(first)
    if True:
        for f in first:
            url = "https://www.fonecta.fi/haku/" + f
            data, links = search_start_url(driver, url, data, links)
    data["names"], first, last = clean_names(data["names"])
    print(last)
    random.shuffle(last)
    print(len(last))
    print(len(first))
    for l in last:
        url = "https://www.fonecta.fi/haku/" + l
        print(url)
        data, links = search_start_url(driver, url, data, links)
    place, street, zipc = clean_loc(data["addresses"])
    for l in last:
        for p in place:
            url = "https://www.fonecta.fi/haku/" + l + "?location=" + p
            data, links = search_start_url(driver, url, data, links)
    time.sleep(100)
    # // *[ @ id = "__next"] / div / div[2] / div[3] / div[2] / div[1] / div[1] / div / div[2] / div / div[2] / a[1]
    driver.close()
    pass

# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    print_hi('PyCharm')
    crawl_selenium()

# See PyCharm help at https://www.jetbrains.com/help/pycharm/
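For reference, the script checkpoints its progress into the two JSON files written in search_start_url(); a minimal sketch of reading them back after a run (the filenames and structure come from the script, the loader itself is not part of main.py):

import json

with open('data_fi.json') as fp:
    data = json.load(fp)   # {"names": [...], "addresses": [...], "emails": [...]}
with open('links_fi.json') as fp:
    links = json.load(fp)  # profile URLs that have already been crawled

print(len(data['names']), len(data['addresses']), len(data['emails']), len(links))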