From f3826ed74fe3ea9d7cff9d0d2a9ec8e1a4efc241 Mon Sep 17 00:00:00 2001
From: Mark Marlow
Date: Sun, 18 Oct 2020 10:53:34 +1100
Subject: [PATCH 1/2] refactored out to function to allow multipages (#4)

---
Review notes (text between the "---" marker and the diff is dropped by
`git am`, so nothing here reaches the commit):
  * indeed_job_search: `exit` in the `except Exception` branch is a bare
    name, not a call, so it does nothing and `while True` is not left there.
  * indeed_job_search: when `page_number == pages`, clean_up() runs but there
    is no break/return, so `next_button.click()` is still executed afterwards.
  * close_popup_if_present: bare `except:` followed by `pass` hides every
    error, not only "popup not present".
  * `[write_job(...) for ...]` builds a list only for its side effects, and
    initialize_file() returns an open handle that is closed only via clean_up().

 job-search-web-scraping.py | 73 +++++++++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 16 deletions(-)

diff --git a/job-search-web-scraping.py b/job-search-web-scraping.py
index aa4786b..494d657 100644
--- a/job-search-web-scraping.py
+++ b/job-search-web-scraping.py
@@ -2,35 +2,76 @@
 from selenium.webdriver.common.keys import Keys
 
 
-def indeed_job_search():
-
-    PATH_TO_DRIVER = './geckodriver'
+INDEED_URL = "https://www.indeed.com/worldwide"
+PATH_TO_DRIVER = "./geckodriver"
+WAIT_TIME = 5
+PAGES = 10
 
 
-    browser = webdriver.Firefox(executable_path=PATH_TO_DRIVER)
-    browser.get('https://www.indeed.com/worldwide')
+def initial_search(search_term, driver_path):
+    browser = webdriver.Firefox(executable_path=driver_path)
 
-    browser.implicitly_wait(5)
+    browser.get(INDEED_URL)
 
-    search_bar = browser.find_element_by_name('q')
-    search_bar.send_keys('machine learning')
+    browser.implicitly_wait(WAIT_TIME)
+
+    search_bar = browser.find_element_by_name("q")
+    search_bar.send_keys(search_term)
     search_bar.send_keys(Keys.ENTER)
 
-    browser.implicitly_wait(5)
+    browser.implicitly_wait(WAIT_TIME)
+    return browser
 
-    search_results = browser.find_elements_by_xpath('//h2/a')
 
-    file = open("job_search.txt", 'a')
+def initialize_file(search_term):
+    file = open(f"job_search_{search_term.replace(' ','_')}.txt", "a")
     file.write("\n")
+    return file
+
+
+def write_job(job_element, file):
+    job_title = job_element.text
+    job_link = job_element.get_attribute("href")
+
+    file.write("%s | link: %s \n" % (job_title, job_link))
+
 
-    for job_element in search_results:
+def get_jobs(browser):
+    return browser.find_elements_by_xpath("//h2/a")
 
-        job_title = job_element.text
-        job_link = job_element.get_attribute('href')
 
-        file.write("%s | link: %s \n" %(job_title, job_link))
+def close_popup_if_present(browser):
+    try:
+        popup_cross = browser.find_element_by_class_name("popover-x-button-close")
+        popup_cross.click()
+    except:
+        pass
+
 
+def clean_up(browser, file):
     browser.close()
+    file.close()
+
+
+def indeed_job_search(search_term, pages=10, driver_path="./geckodriver"):
+    browser = initial_search(search_term, driver_path)
+    file = initialize_file(search_term)
+    page_number = 1
+    while True:
+        search_results = get_jobs(browser)
+        [write_job(job_element, file) for job_element in search_results]
+        try:
+            next_button = browser.find_element_by_xpath("//a[@aria-label='Next']")
+            if page_number == pages:
+                clean_up(browser, file)
+            next_button.click()
+            page_number += 1
+            browser.implicitly_wait(WAIT_TIME)
+            close_popup_if_present(browser)
+        except Exception:
+            clean_up(browser, file)
+            exit
+
 
 if __name__ == "__main__":
-    indeed_job_search()
\ No newline at end of file
+    indeed_job_search("machine learning", PAGES, PATH_TO_DRIVER)

From 206bdbb592d5a1f99014c4f0af5334d860b3117b Mon Sep 17 00:00:00 2001
From: Mark Marlow
Date: Mon, 26 Oct 2020 12:10:21 +1100
Subject: [PATCH 2/2] updated to look out for cross on popup

---
Review notes (text between the "---" marker and the diff is dropped by
`git am`, so nothing here reaches the commit):
  * `By`, `WebDriverWait` and `EC` are imported here but nothing in this
    series uses them.
  * PATH_TO_DRIVER becomes a Windows-style "geckodriver.exe" path, while the
    `driver_path` default of indeed_job_search stays "./geckodriver".
  * close_popup_if_present: indexing `popup_cross[0]` on an empty result is
    what produces the False return; the bare `except:` also turns any other
    failure into False.
  * The bare `exit` and the missing break after clean_up() in
    indeed_job_search are carried over unchanged from patch 1.
  * A blank line is added directly after `try:` in close_popup_if_present.

 job-search-web-scraping.py | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/job-search-web-scraping.py b/job-search-web-scraping.py
index 494d657..95fe655 100644
--- a/job-search-web-scraping.py
+++ b/job-search-web-scraping.py
@@ -1,9 +1,12 @@
+import time
 from selenium import webdriver
 from selenium.webdriver.common.keys import Keys
-
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 
 INDEED_URL = "https://www.indeed.com/worldwide"
-PATH_TO_DRIVER = "./geckodriver"
+PATH_TO_DRIVER = "./geckodriver/geckodriver.exe"
 WAIT_TIME = 5
 PAGES = 10
 
@@ -42,32 +45,42 @@ def get_jobs(browser):
 
 def close_popup_if_present(browser):
     try:
-        popup_cross = browser.find_element_by_class_name("popover-x-button-close")
-        popup_cross.click()
+
+        popup_cross = browser.find_elements_by_xpath(
+            "//button[contains(@class,'popover-x-button-close')]"
+        )
+        popup_cross[0].click()
+        return True
     except:
-        pass
+        return False
 
 
 def clean_up(browser, file):
-    browser.close()
+    try:
+        browser.close()
+    except:
+        pass
     file.close()
 
 
 def indeed_job_search(search_term, pages=10, driver_path="./geckodriver"):
+    popup_encountered = False
     browser = initial_search(search_term, driver_path)
     file = initialize_file(search_term)
     page_number = 1
     while True:
+        if not popup_encountered:
+            time.sleep(WAIT_TIME)
+            popup_encountered = close_popup_if_present(browser)
         search_results = get_jobs(browser)
         [write_job(job_element, file) for job_element in search_results]
         try:
+            browser.implicitly_wait(WAIT_TIME)
             next_button = browser.find_element_by_xpath("//a[@aria-label='Next']")
             if page_number == pages:
                 clean_up(browser, file)
             next_button.click()
             page_number += 1
-            browser.implicitly_wait(WAIT_TIME)
-            close_popup_if_present(browser)
         except Exception:
             clean_up(browser, file)
             exit