-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcostco.py
More file actions
123 lines (94 loc) · 3.97 KB
/
costco.py
File metadata and controls
123 lines (94 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import requests
from bs4 import BeautifulSoup
URL = "https://chm.tbe.taleo.net/chm02/ats/careers/v2/searchResults?org=COSTCO&cws=41"
BASE_URL = "https://chm.tbe.taleo.net"
DEBUG = False
class Parser:
def __init__(self):
self.html_text = ""
self.session = requests.Session()
def open_local_html(self, name):
with open(name, "r") as file:
return file.read()
def post(self, url, data):
res = self.session.post(url, data=data)
res.raise_for_status()
if DEBUG:
with open("debug.html", "w") as file:
file.write(res.text)
self.soup = BeautifulSoup(res.text, "html.parser")
return res.content
def get(self, url, **kwargs):
res = self.session.get(url, **kwargs)
res.raise_for_status()
if DEBUG:
with open("debug_get.html", "w") as file:
file.write(res.text)
self.soup = BeautifulSoup(res.text, "html.parser")
return res.text
def find_selectors(self, el, config, soup=None):
if not soup:
soup = self.soup
classes = soup.findAll(el, config)
return classes
class CostcoPaser:
def __init__(self, job_title):
self.parser = Parser()
self.search_data = {"act": "search",
"org": "COSTO",
"cws": 41,
"WebPage": "JSRCH_V2",
"WebVersion": 2,
"keywords": job_title}
self.res = []
def _html_to_links(self):
soaped_links = self.parser.find_selectors("div", {"class": "oracletaleocwsv2-accordion-head-info"})
soaped_next = self.parser.find_selectors("a", {"class": "jscroll-next"})
for soap_link in soaped_links:
divs = [x.text for x in soap_link.findChildren("div")]
self.res.append(
{"url": soap_link.h4.a["href"],
"category": divs[0],
"location": divs[1],
"title": soap_link.h4.text
})
return soaped_next
def _one_job_html_to_json(self):
pre_number_soap = self.parser.find_selectors("div", {"class": "well oracletaleocwsv2-job-description"})
number = self.parser.find_selectors(
"div", {"class": "col-xs-12 col-sm-4 col-md-12"}, pre_number_soap[0])[1].strong.text
pre_description = self.parser.find_selectors("div", {"class": "col-xs-12 col-sm-12 col-md-8"})
res = pre_description[0]
# Remove all scripts from description
for _ in res.findChildren("script"):
res.script.decompose()
# Remove all buttons and shares from description
for match in self.parser.find_selectors("div", {
"class": "oracletaleocwsv2-button-navigation oracletaleocwsv2-job-description clearfix"}, soup=res):
match.decompose()
desc = res.text
return {"number": number,
"description": desc}
def analyze_job_links(self):
"""Iterate through links and extract data to self.res"""
for item in self.res:
self.parser.get(item["url"])
add_info = self._one_job_html_to_json()
item["order_number"] = add_info["number"].replace('\n', '')
item["description"] = add_info["description"]
def process(self):
self.parser.post(url=URL, data=self.search_data)
soaped_next = self._html_to_links()
# Collect links to jobs to self.job_links
if soaped_next:
url = "{}{}".format(BASE_URL, soaped_next[0]["href"])
while soaped_next:
self.parser.get(url)
soaped_next = self._html_to_links()
if soaped_next:
url = "{}{}".format(BASE_URL, soaped_next[0]["href"])
self.analyze_job_links()
return self.res
def main(job_title):
return CostcoPaser(job_title=job_title).process()
#Example: main("software engineer")