Skip to content

Commit ee96d18

Browse files
Merge pull request #57 from realpython/web-scraping-bs4
Web scraping bs4
2 parents 06bcbdf + a5aaa9e commit ee96d18

File tree

2 files changed

+122
-0
lines changed

2 files changed

+122
-0
lines changed

web-scraping-bs4/job_search.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
import argparse
4+
5+
6+
def scrape_jobs(location=None):
    """Scrapes Developer job postings from Monster, optionally by location.

    :param location: Where the job is located
    :type location: str
    :return: all job postings from first page that match the search results
    :rtype: BeautifulSoup object
    """
    # Let requests build and percent-encode the query string instead of
    # hand-assembling the URL: the old f-string with a backslash line
    # continuation risked embedding stray whitespace in the URL, left
    # `location` unescaped (spaces, commas), and the no-location branch
    # used an f-string with no placeholder at all.
    params = {"q": "Software-Developer"}
    if location:
        params["where"] = location
    page = requests.get("https://www.monster.com/jobs/search/", params=params)

    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find(id="ResultsContainer")
    return results
24+
25+
26+
def filter_jobs_by_keyword(results, word):
    """Filters job postings by word and prints matching job title plus link.

    :param results: Parsed HTML container with all job listings
    :type results: BeautifulSoup object
    :param word: keyword to filter by (callers pass it lowercased)
    :type word: str
    :return: None - just meant to print results
    :rtype: None
    """
    # Guard against None: BeautifulSoup calls the string filter with None
    # for <h2> tags whose content is not a single string, and the old
    # unguarded `word in text.lower()` raised TypeError on those tags.
    filtered_jobs = results.find_all(
        "h2", string=lambda text: text is not None and word in text.lower()
    )
    for f_job in filtered_jobs:
        link = f_job.find("a")["href"]
        print(f_job.text.strip())
        print(f"Apply here: {link}\n")
43+
44+
45+
def print_all_jobs(results):
    """Print details of all jobs returned by the search.

    The printed details are title, link, company name and location of the job.

    :param results: Parsed HTML container with all job listings
    :type results: BeautifulSoup object
    :return: None - just meant to print results
    :rtype: None
    """
    for card in results.find_all("section", class_="card-content"):
        # Each card is itself a BeautifulSoup object we can search into.
        title = card.find("h2", class_="title")
        company = card.find("div", class_="company")
        place = card.find("div", class_="location")
        # Skip incomplete cards (e.g. ads) missing any of the three parts.
        if title is None or company is None or place is None:
            continue
        # print(card.prettify()) # to inspect the 'None' element
        print(title.text.strip())
        print(title.find("a")["href"])
        print(company.text.strip())
        print(place.text.strip())
        print()
71+
72+
73+
# USE THE SCRIPT AS A COMMAND-LINE INTERFACE
74+
# ----------------------------------------------------------------------------
75+
my_parser = argparse.ArgumentParser(
76+
prog="jobs", description="Find Developer Jobs"
77+
)
78+
my_parser.add_argument(
79+
"-location", metavar="location", type=str, help="The location of the job"
80+
)
81+
my_parser.add_argument(
82+
"-word", metavar="word", type=str, help="What keyword to filter by"
83+
)
84+
85+
args = my_parser.parse_args()
86+
location, keyword = args.location, args.word
87+
88+
results = scrape_jobs(location)
89+
if keyword:
90+
filter_jobs_by_keyword(results, keyword.lower())
91+
else:
92+
print_all_jobs(results)

web-scraping-bs4/scrape_jobs.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
4+
5+
# Fetch the first results page of Software Developer jobs in Australia.
# Implicit string-literal concatenation replaces the old backslash line
# continuation inside the literal, which would silently embed any
# indentation of the continued line into the URL.
URL = (
    "https://www.monster.com/jobs/search/"
    "?q=Software-Developer&where=Australia"
)
page = requests.get(URL)

soup = BeautifulSoup(page.content, "html.parser")
results = soup.find(id="ResultsContainer")

# Look for Python jobs.  Guard against None: BeautifulSoup calls the
# string filter with None for <h2> tags whose content is not a single
# string, and the unguarded `"python" in t.lower()` raised TypeError.
python_jobs = results.find_all(
    "h2", string=lambda t: t is not None and "python" in t.lower()
)
for p_job in python_jobs:
    link = p_job.find("a")["href"]
    print(p_job.text.strip())
    print(f"Apply here: {link}\n")

# Print out all available jobs from the scraped webpage
job_elems = results.find_all("section", class_="card-content")
for job_elem in job_elems:
    title_elem = job_elem.find("h2", class_="title")
    company_elem = job_elem.find("div", class_="company")
    location_elem = job_elem.find("div", class_="location")
    # Skip incomplete cards (e.g. ads) missing any of the three parts.
    if None in (title_elem, company_elem, location_elem):
        continue
    print(title_elem.text.strip())
    print(company_elem.text.strip())
    print(location_elem.text.strip())
    print()

0 commit comments

Comments
 (0)