-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper_startup.py
More file actions
83 lines (69 loc) · 3.82 KB
/
scraper_startup.py
File metadata and controls
83 lines (69 loc) · 3.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import asyncio
from playwright.async_api import async_playwright
from playwright_stealth import Stealth
from utils import check_language_requirements, extract_emails, clean_text, is_likely_target_language
from config import HEADLESS, REQUEST_TIMEOUT, STARTUP_JOBS_URL
async def scrape_startup_jobs(
    search_term="Frontend",
    location="Germany",
    target_lang="English",
    *,
    max_listings=8,
):
    """
    Scrapes StartupJobs.com for tech startup jobs.

    Opens the search page built from ``STARTUP_JOBS_URL``, inspects at most
    ``max_listings`` result cards, and visits the detail page of every card
    whose title passes ``is_likely_target_language``. A listing is kept only
    when ``check_language_requirements`` accepts its description text.

    Args:
        search_term: Value substituted for ``{keyword}`` in ``STARTUP_JOBS_URL``.
        location: Value substituted for ``{location}`` in ``STARTUP_JOBS_URL``;
            also copied into every result under the "location" key.
        target_lang: Language forwarded to the two language-check helpers.
        max_listings: Maximum number of result cards to inspect (default 8).
            ``None`` inspects every card found; negative values are treated
            as 0.

    Returns:
        A list of dicts with the keys "title", "company", "link", "emails",
        "location" and "source". The list is empty when the search page fails
        to load or no ``.job-list-item`` element appears within 10 seconds.
    """
    # Local import keeps this block self-contained; urljoin is stdlib.
    from urllib.parse import urljoin

    blocked_resources = "**/*.{png,jpg,jpeg,gif,svg,css,woff2}"
    site_root = "https://www.startupjobs.com"
    card_limit = None if max_listings is None else max(0, max_listings)

    async def abort_request(route):
        # Optimization: block images, fonts and stylesheets we never read.
        await route.abort()

    results = []
    base_url = STARTUP_JOBS_URL.format(keyword=search_term, location=location)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=HEADLESS)
        # try/finally guarantees the browser is closed on every exit path,
        # including exceptions raised while querying the result cards.
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            )
            page = await context.new_page()
            await page.route(blocked_resources, abort_request)

            print(f"Navigating to StartupJobs: {base_url}")
            try:
                await page.goto(base_url, wait_until="domcontentloaded", timeout=REQUEST_TIMEOUT)
                # Wait for job list
                await page.wait_for_selector(".job-list-item", timeout=10000)
            except Exception as e:
                # Best effort: an unreachable results page yields no jobs
                # instead of propagating to the caller.
                print(f"Error navigating to StartupJobs: {e}")
                return results

            job_cards = await page.query_selector_all(".job-list-item")
            print(f"Found {len(job_cards)} potential job listings on StartupJobs.")

            for card in job_cards[:card_limit]:
                title_el = await card.query_selector(".job-list-item-title")
                title = await title_el.inner_text() if title_el else ""

                # EARLY EXIT: Snippet check on the title, before spending
                # further DOM queries or a detail-page load on this card.
                if not is_likely_target_language(title, target_lang):
                    continue

                link_el = await card.query_selector("a")
                link = await link_el.get_attribute("href") if link_el else ""
                if not link:
                    continue

                company_el = await card.query_selector(".job-list-item-company")
                company = await company_el.inner_text() if company_el else ""

                # urljoin leaves absolute URLs untouched and resolves both
                # "/path" and "path" style hrefs against the site root.
                link = urljoin(site_root, link)

                detail_page = await context.new_page()
                try:
                    await detail_page.route(blocked_resources, abort_request)
                    await detail_page.goto(link, wait_until="domcontentloaded", timeout=30000)
                    # Fall back to the whole body when the page has no
                    # dedicated description container.
                    description_el = (
                        await detail_page.query_selector(".job-description")
                        or await detail_page.query_selector("body")
                    )
                    description = await description_el.inner_text() if description_el else ""

                    if check_language_requirements(description, target_lang):
                        results.append({
                            "title": clean_text(title),
                            "company": clean_text(company),
                            "link": link,
                            "emails": extract_emails(description),
                            "location": location,
                            "source": "StartupJobs",
                        })
                except Exception as e:
                    # One failing detail page must not abort the whole run.
                    print(f"Error scraping detail {link}: {e}")
                finally:
                    await detail_page.close()
        finally:
            await browser.close()

    return results
if __name__ == "__main__":
    # Script entry point: run the scraper with its default arguments and
    # print a short summary block for every listing it returned.
    for job in asyncio.run(scrape_startup_jobs()):
        print(f"--- \nTitle: {job['title']}\nCompany: {job['company']}\nEmails: {job['emails']}\nLink: {job['link']}\n")