# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
from requests_html import HTMLSession
from selenium import webdriver
import time
import json
import random
import copy

def print_hi(name):
    # Use a breakpoint in the code line below to debug your script.
    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.

def crawl():
    # In a Jupyter notebook / Spyder there is already an async session running:
    # https://github.com/psf/requests-html/issues/294#issuecomment-516709659
    url = "https://www.fonecta.fi/haku/a?location=oulu"
    s = HTMLSession()
    r = s.get(url)
    r.html.render(sleep=1)
    print(r.status_code)  # should be 200
    l = r.html.find(".ResultFilterContainerComponent_resultFilterOptions__3WUdC")
    print(l)
    print(len(l))
    script = """
        () => {
            $(document).ready(function() {
                $("#submit_button").click();
            })
        }
    """
    r.html.render(script=script, reload=False)
    # print(r.html.text)

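# The issue linked in crawl() is about environments (Jupyter/Spyder) that already run an
# event loop. As an illustrative sketch only, not part of the original crawler, the same
# request could be written with requests_html's AsyncHTMLSession; the function name
# crawl_async and its default parameter are assumptions added here.
async def crawl_async(url="https://www.fonecta.fi/haku/a?location=oulu"):
    from requests_html import AsyncHTMLSession
    asession = AsyncHTMLSession()
    r = await asession.get(url)
    # arender() is the asynchronous counterpart of render()
    await r.html.arender(sleep=1)
    print(r.status_code)  # should be 200
    return r
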
def append_list(l, ele):
    # append ele to l only if it is not already present
    if ele in l:
        print("already exists")
    else:
        l.append(ele)
    return l

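# Quick illustration (added here, not in the original script): append_list mutates the
# list in place and also returns it, skipping duplicates.
#   seen = ["Anna Virtanen"]                    # hypothetical example value
#   append_list(seen, "Anna Virtanen")          # prints "already exists"
#   append_list(seen, "Matti Korhonen")         # seen now holds two names
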
def crawl_links(driver, l, data):
    # visit a single profile link and harvest name, address and email into data
    time.sleep(random.randint(1, 10))
    names = copy.deepcopy(data["names"])
    addresses = copy.deepcopy(data["addresses"])
    emails = copy.deepcopy(data["emails"])
    driver.get(l)
    time.sleep(1)
    # get address
    try:
        address = driver.find_elements_by_css_selector(
            ".profile_header_component_profileAddressRowLnk__2eMaT span")
        tmp = address[1].get_attribute('innerHTML')
        print(tmp)
        addresses = append_list(addresses, tmp)
    except:
        pass
    # get name
    try:
        address = driver.find_elements_by_css_selector(
            ".profile_header_component_profile-information-name__1P390")
        tmp = address[0].get_attribute('innerHTML')
        print(tmp)
        names = append_list(names, tmp)
    except:
        pass
    # get the email
    try:
        address = driver.find_elements_by_css_selector(
            ".contact_info_row_email__2OtOQ span")
        tmp = address[1].get_attribute('innerHTML')
        print(tmp)
        emails = append_list(emails, tmp)
    except:
        pass
    if False:
        # unused: remove the element from the DOM via JavaScript
        element = address
        driver.execute_script("""
            var element = arguments[0];
            element.parentNode.removeChild(element);
        """, element)
    if False:
        # unused: dump all attributes of the element
        element = address
        attrs = driver.execute_script(
            'var items = {}; '
            'for (index = 0; index < arguments[0].attributes.length; ++index) '
            '{ items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value }; '
            'return items;',
            element)
        print(attrs)
    time.sleep(1)
    data["names"] = names
    data["addresses"] = addresses
    data["emails"] = emails
    return data

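# Note: the find_elements_by_css_selector / find_element_by_xpath helpers used in this
# script exist in Selenium 3 but were removed in Selenium 4. A minimal sketch of the
# equivalent lookups, assuming Selenium 4 is installed (illustrative, not used below):
#   from selenium.webdriver.common.by import By
#   address = driver.find_elements(By.CSS_SELECTOR,
#                                  ".profile_header_component_profileAddressRowLnk__2eMaT span")
#   button = driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]')
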
def search_start_url(driver, url, data, links_json):
    # run one search URL, collect profile links from the result list and crawl them
    print(url)
    driver.get(url)
    time.sleep(1)
    # confirm the use of cookies etc.
    try:
        button = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
        button.click()
    except:
        pass
    time.sleep(1)
    try:
        button = driver.find_element_by_xpath(
            '//*[@id="__next"]/div/div[2]/div[3]/div[1]/div[1]/div[1]/button[3]')
        print(button)
        time.sleep(1)
        button.click()
        print("clicked on button")
        time.sleep(5)
        print("results")
        links = []
        for i in range(2, 10):
            try:
                href = driver.find_element_by_xpath(
                    '//*[@id="__next"]/div/div[2]/div[3]/div[2]/div[1]/div[1]/div/div['
                    + str(i) + ']/div/div[2]/a[1]')
                links.append(href.get_attribute('href'))
            except:
                print("failed for link {}".format(i))
            time.sleep(1)
        for l in links:
            if l not in links_json:
                data = crawl_links(driver, l, data)
                links_json.append(l)
                # checkpoint the collected data and the crawled links after every profile
                with open('data_fi.json', 'w') as fp:
                    json.dump(data, fp)
                print("# names, # addresses, # emails")
                print(str(len(data['names'])) + "/" +
                      str(len(data['addresses'])) + "/" +
                      str(len(data['emails'])))
                with open('links_fi.json', 'w') as fp:
                    json.dump(links_json, fp)
    except:
        print("failed to perform a search")
    # return the cumulative link list so already-crawled profiles stay deduplicated
    return data, links_json

def crawl_selenium():
    data = {
        "names": [],
        "addresses": [],
        "emails": [],
    }
    links = []
    # resume from previous runs if the checkpoint files exist
    try:
        with open('data_fi.json', 'r') as fp:
            data = json.load(fp)
        with open('links_fi.json', 'r') as fp:
            links = json.load(fp)
    except:
        pass
    PATH = r"C:\Program Files (x86)\chromedriver.exe"
    driver = webdriver.Chrome(PATH)
    abc = "abcdefghijklmnopqrstuvwxyz"
    if False:
        # seed the data set by searching every letter of the alphabet
        for letter in abc:
            url = "https://www.fonecta.fi/haku/" + letter
            data, links = search_start_url(driver, url, data, links)
    print(data["names"])

    def clean_names(names):
        # split full names into first and last names; if a name does not consist of
        # exactly two space-separated parts, it is most likely not a name and is dropped
        first = []
        last = []
        for n in list(names):
            tmp = n.split(" ")
            if len(tmp) != 2:
                names.remove(n)
            else:
                first.append(tmp[0])
                last.append(tmp[1])
        return names, list(set(first)), list(set(last))

    def clean_loc(addresses):
        # split "street, zip place" address strings into their components
        place = []
        street = []
        zipc = []
        for a in addresses:
            tmp = a.split(", ")
            street.append(tmp[0])
            tmp = tmp[1].split(" ")
            place.append(tmp[-1])
            zipc.append(tmp[-2])
        return place, street, zipc

    data["names"], first, last = clean_names(data["names"])
    random.shuffle(first)
    print(len(last))
    print(len(first))
    print(first)
    if True:
        # search by first names
        for f in first:
            url = "https://www.fonecta.fi/haku/" + f
            data, links = search_start_url(driver, url, data, links)
    data["names"], first, last = clean_names(data["names"])
    print(last)
    random.shuffle(last)
    print(len(last))
    print(len(first))
    # search by last names
    for l in last:
        url = "https://www.fonecta.fi/haku/" + l
        print(url)
        data, links = search_start_url(driver, url, data, links)
    place, street, zipc = clean_loc(data["addresses"])
    # search by last name combined with location
    for l in last:
        for p in place:
            url = "https://www.fonecta.fi/haku/" + l + "?location=" + p
            data, links = search_start_url(driver, url, data, links)
    time.sleep(100)
    # //*[@id="__next"]/div/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[2]/div/div[2]/a[1]
    driver.close()

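# In Selenium 4 the bare executable path passed to webdriver.Chrome() above is no longer
# accepted. A minimal sketch of the newer setup, assuming Selenium 4 and the same
# chromedriver location (illustrative only, not part of the original script):
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service(r"C:\Program Files (x86)\chromedriver.exe"))
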
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    print_hi('PyCharm')
    crawl_selenium()

# See PyCharm help at https://www.jetbrains.com/help/pycharm/
