# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
from requests_html import HTMLSession
from selenium import webdriver
import time
import json
import random
import copy

def print_hi(name):
    # Use a breakpoint in the code line below to debug your script.
    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.

def crawl():
    # In a Jupyter notebook / Spyder there is already an async session running:
    # https://github.com/psf/requests-html/issues/294#issuecomment-516709659
    url = "https://www.fonecta.fi/haku/a?location=oulu"
    s = HTMLSession()
    r = s.get(url)
    r.html.render(sleep=1)
    print(r.status_code)  # should be 200
    l = r.html.find(".ResultFilterContainerComponent_resultFilterOptions__3WUdC")
    print(l)
    print(len(l))
    script = """
        () => {
            $(document).ready(function() {
                $("#submit_button").click();
            })
        }
    """
    r.html.render(script=script, reload=False)
    # print(r.html.text)

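# The issue linked in crawl() is about environments (Jupyter/Spyder) that already run an
# event loop. As an illustrative sketch only, not part of the original crawler, the same
# request could be written with requests_html's AsyncHTMLSession; the function name
# crawl_async and its default parameter are assumptions added here.
async def crawl_async(url="https://www.fonecta.fi/haku/a?location=oulu"):
    from requests_html import AsyncHTMLSession
    asession = AsyncHTMLSession()
    r = await asession.get(url)
    # arender() is the asynchronous counterpart of render()
    await r.html.arender(sleep=1)
    print(r.status_code)  # should be 200
    return r
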
def append_list(l, ele):
    # append ele to l only if it is not already present
    if ele in l:
        print("already exists")
    else:
        l.append(ele)
    return l

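# Quick illustration (added here, not in the original script): append_list mutates the
# list in place and also returns it, skipping duplicates.
#   seen = ["Anna Virtanen"]                    # hypothetical example value
#   append_list(seen, "Anna Virtanen")          # prints "already exists"
#   append_list(seen, "Matti Korhonen")         # seen now holds two names
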
def crawl_links(driver, l, data):
    # visit a single profile link and harvest name, address and email into data
    time.sleep(random.randint(1, 10))
    names = copy.deepcopy(data["names"])
    addresses = copy.deepcopy(data["addresses"])
    emails = copy.deepcopy(data["emails"])
    driver.get(l)
    time.sleep(1)
    # get address
    try:
        address = driver.find_elements_by_css_selector(
            ".profile_header_component_profileAddressRowLnk__2eMaT span")
        tmp = address[1].get_attribute('innerHTML')
        print(tmp)
        addresses = append_list(addresses, tmp)
    except:
        pass
    # get name
    try:
        address = driver.find_elements_by_css_selector(
            ".profile_header_component_profile-information-name__1P390")
        tmp = address[0].get_attribute('innerHTML')
        print(tmp)
        names = append_list(names, tmp)
    except:
        pass
    # get the email
    try:
        address = driver.find_elements_by_css_selector(
            ".contact_info_row_email__2OtOQ span")
        tmp = address[1].get_attribute('innerHTML')
        print(tmp)
        emails = append_list(emails, tmp)
    except:
        pass
    if False:
        # unused: remove the element from the DOM via JavaScript
        element = address
        driver.execute_script("""
            var element = arguments[0];
            element.parentNode.removeChild(element);
        """, element)
    if False:
        # unused: dump all attributes of the element
        element = address
        attrs = driver.execute_script(
            'var items = {}; '
            'for (index = 0; index < arguments[0].attributes.length; ++index) '
            '{ items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value }; '
            'return items;',
            element)
        print(attrs)
    time.sleep(1)
    data["names"] = names
    data["addresses"] = addresses
    data["emails"] = emails
    return data

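# Note: the find_elements_by_css_selector / find_element_by_xpath helpers used in this
# script exist in Selenium 3 but were removed in Selenium 4. A minimal sketch of the
# equivalent lookups, assuming Selenium 4 is installed (illustrative, not used below):
#   from selenium.webdriver.common.by import By
#   address = driver.find_elements(By.CSS_SELECTOR,
#                                  ".profile_header_component_profileAddressRowLnk__2eMaT span")
#   button = driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]')
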
def search_start_url(driver, url, data, links_json):
    # run one search URL, collect profile links from the result list and crawl them
    print(url)
    driver.get(url)
    time.sleep(1)
    # confirm the use of cookies etc.
    try:
        button = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
        button.click()
    except:
        pass
    time.sleep(1)
    try:
        button = driver.find_element_by_xpath(
            '//*[@id="__next"]/div/div[2]/div[3]/div[1]/div[1]/div[1]/button[3]')
        print(button)
        time.sleep(1)
        button.click()
        print("clicked on button")
        time.sleep(5)
        print("results")
        links = []
        for i in range(2, 10):
            try:
                href = driver.find_element_by_xpath(
                    '//*[@id="__next"]/div/div[2]/div[3]/div[2]/div[1]/div[1]/div/div['
                    + str(i) + ']/div/div[2]/a[1]')
                links.append(href.get_attribute('href'))
            except:
                print("failed for link {}".format(i))
            time.sleep(1)
        for l in links:
            if l not in links_json:
                data = crawl_links(driver, l, data)
                links_json.append(l)
                # checkpoint the collected data and the crawled links after every profile
                with open('data_fi.json', 'w') as fp:
                    json.dump(data, fp)
                print("# names, # addresses, # emails")
                print(str(len(data['names'])) + "/" +
                      str(len(data['addresses'])) + "/" +
                      str(len(data['emails'])))
                with open('links_fi.json', 'w') as fp:
                    json.dump(links_json, fp)
    except:
        print("failed to perform a search")
    # return the cumulative link list so already-crawled profiles stay deduplicated
    return data, links_json

def crawl_selenium():
    data = {
        "names": [],
        "addresses": [],
        "emails": [],
    }
    links = []
    # resume from previous runs if the checkpoint files exist
    try:
        with open('data_fi.json', 'r') as fp:
            data = json.load(fp)
        with open('links_fi.json', 'r') as fp:
            links = json.load(fp)
    except:
        pass
    PATH = r"C:\Program Files (x86)\chromedriver.exe"
    driver = webdriver.Chrome(PATH)
    abc = "abcdefghijklmnopqrstuvwxyz"
    if False:
        # seed the data set by searching every letter of the alphabet
        for letter in abc:
            url = "https://www.fonecta.fi/haku/" + letter
            data, links = search_start_url(driver, url, data, links)
    print(data["names"])

    def clean_names(names):
        # split full names into first and last names; if a name does not consist of
        # exactly two space-separated parts, it is most likely not a name and is dropped
        first = []
        last = []
        for n in list(names):
            tmp = n.split(" ")
            if len(tmp) != 2:
                names.remove(n)
            else:
                first.append(tmp[0])
                last.append(tmp[1])
        return names, list(set(first)), list(set(last))

    def clean_loc(addresses):
        # split "street, zip place" address strings into their components
        place = []
        street = []
        zipc = []
        for a in addresses:
            tmp = a.split(", ")
            street.append(tmp[0])
            tmp = tmp[1].split(" ")
            place.append(tmp[-1])
            zipc.append(tmp[-2])
        return place, street, zipc

    data["names"], first, last = clean_names(data["names"])
    random.shuffle(first)
    print(len(last))
    print(len(first))
    print(first)
    if True:
        # search by first names
        for f in first:
            url = "https://www.fonecta.fi/haku/" + f
            data, links = search_start_url(driver, url, data, links)
    data["names"], first, last = clean_names(data["names"])
    print(last)
    random.shuffle(last)
    print(len(last))
    print(len(first))
    # search by last names
    for l in last:
        url = "https://www.fonecta.fi/haku/" + l
        print(url)
        data, links = search_start_url(driver, url, data, links)
    place, street, zipc = clean_loc(data["addresses"])
    # search by last name combined with location
    for l in last:
        for p in place:
            url = "https://www.fonecta.fi/haku/" + l + "?location=" + p
            data, links = search_start_url(driver, url, data, links)
    time.sleep(100)
    # //*[@id="__next"]/div/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[2]/div/div[2]/a[1]
    driver.close()

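# In Selenium 4 the bare executable path passed to webdriver.Chrome() above is no longer
# accepted. A minimal sketch of the newer setup, assuming Selenium 4 and the same
# chromedriver location (illustrative only, not part of the original script):
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service(r"C:\Program Files (x86)\chromedriver.exe"))
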
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    print_hi('PyCharm')
    crawl_selenium()

# See PyCharm help at https://www.jetbrains.com/help/pycharm/
