-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindeed.py
More file actions
54 lines (45 loc) · 1.78 KB
/
indeed.py
File metadata and controls
54 lines (45 loc) · 1.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import requests
from bs4 import BeautifulSoup
# Number of results requested per Indeed page; also the offset step used
# when building the "&start=" paging parameter in extract_jobs.
LIMIT = 50
def get_last_page(url):
    """Return the number of result pages for an Indeed search URL.

    Fetches the first results page, parses the pagination list, and
    returns the highest page number shown. The trailing anchor is
    excluded because it is the "Next" button, not a page number.

    Args:
        url: full Indeed search URL (query + limit already encoded).

    Returns:
        int: the last page number, or 1 when no pagination is present
        (single page of results, or the markup changed).
    """
    result = requests.get(url)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("ul", {"class": "pagination-list"})
    # Fix: the original raised AttributeError when the pagination widget
    # was missing; a single page of results is a normal case, not an error.
    if pagination is None:
        return 1
    links = pagination.find_all('a')
    # links[:-1] drops the final "Next" anchor, which has no page number.
    pages = [int(link.string) for link in links[:-1]]
    # Fix: the original indexed pages[-1] and crashed on an empty list.
    return pages[-1] if pages else 1
def extract_job(html):
    """Parse one job-card element into a dict.

    Args:
        html: a bs4 element for a single "jobsearch-SerpJobCard" div.

    Returns:
        dict with keys 'title', 'company', 'location', and 'link'
        (a kr.indeed.com view-job URL built from the card's data-jk id).
    """
    title_anchor = html.find("h2", {"class": "title"}).find("a")
    title = title_anchor["title"]
    company_tag = html.find("span", {"class": "company"})
    # The company name is sometimes wrapped in an <a>, sometimes bare text.
    company_link = company_tag.find("a")
    if company_link is None:
        company = str(company_tag.string)
    else:
        company = str(company_link.string)
    company = company.strip()  # remove surrounding whitespace
    location = html.find("div", {"class": "recJobLoc"})["data-rc-loc"]
    job_id = html["data-jk"]
    return {
        'title': title,
        'company': company,
        'location': location,
        "link": f"https://kr.indeed.com/채용보기?jk={job_id}",
    }
def extract_jobs(last_page, url):
    """Scrape every results page and return a list of parsed job dicts.

    Args:
        last_page: number of pages to scrape (as given by get_last_page).
        url: base search URL; "&start=<page*LIMIT>" is appended per page.

    Returns:
        list[dict]: one dict per job card, as produced by extract_job.
    """
    jobs = []
    for page in range(last_page):
        print(f"indeed {page}번째 페이지 scrapping 중")
        # Fix: the original bound the HTTP response to `result` and then
        # reused `result` as the inner loop variable over the job cards,
        # shadowing the response — confusing and error-prone.
        response = requests.get(f"{url}&start={page*LIMIT}")
        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
        jobs.extend(extract_job(card) for card in cards)
    return jobs
def get_jobs(word):
    """Search Indeed Korea for *word* and return all scraped job postings.

    Builds the search URL, determines how many result pages exist, then
    scrapes and parses every page.

    Args:
        word: the search keyword to query for.

    Returns:
        list[dict]: all job postings found across every result page.
    """
    search_url = f"https://kr.indeed.com/jobs?q={word}&limit={LIMIT}"
    return extract_jobs(get_last_page(search_url), search_url)