-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape_source.py
More file actions
78 lines (64 loc) · 3.97 KB
/
scrape_source.py
File metadata and controls
78 lines (64 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text
from sqlalchemy.orm import relationship
from sqlalchemy.sql import func
from app.database import Base
class ScrapeSource(Base):
    """A website configured as a source of job listings for the scraper pipeline.

    Each row describes one site and how to scrape it: either via the
    CSS-selector-driven ``GenericScraper``, a ``SitemapScraper`` (XML sitemap),
    or AI-generated custom scraper code stored in ``custom_scraper_code``.

    A source can be in one of several distinct states, tracked by separate
    flags so their meanings don't get conflated:

    - ``is_active=False``       — manually disabled after being tried.
    - ``robots_blocked=True``   — site's robots.txt disallows crawling.
    - ``needs_configuration=True`` — bulk-imported, selectors not set up yet.
    - otherwise active and eligible for scraping.
    """

    __tablename__ = "scrape_sources"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String(255), nullable=False)
    base_url = Column(String(1000), nullable=False)
    scraper_class = Column(String(255), nullable=False, default="GenericScraper")
    is_active = Column(Boolean, default=True)

    # AI-generated custom scraper code (for sites that can't use GenericScraper)
    custom_scraper_code = Column(Text, nullable=True)

    last_scraped_at = Column(DateTime, nullable=True)
    # True=success, False=fail, None=never run
    last_scrape_success = Column(Boolean, nullable=True)
    created_at = Column(DateTime, server_default=func.now())

    # --- GenericScraper configuration: CSS selectors for parsing job listings ---
    # The listing_url is the page containing job listings (can be same as base_url)
    listing_url = Column(String(1000), nullable=True)
    # CSS selector for individual job containers (e.g., ".job-card", "tr.job-row")
    selector_job_container = Column(String(500), nullable=True)
    # CSS selectors for fields within each job container
    selector_title = Column(String(500), nullable=True)
    selector_url = Column(String(500), nullable=True)
    selector_organization = Column(String(500), nullable=True)
    selector_location = Column(String(500), nullable=True)
    selector_job_type = Column(String(500), nullable=True)
    selector_salary = Column(String(500), nullable=True)
    selector_description = Column(String(500), nullable=True)
    # Optional: attribute to extract URL from (default: "href")
    url_attribute = Column(String(100), nullable=True, default="href")
    # Optional: pagination selector for multi-page listings
    selector_next_page = Column(String(500), nullable=True)
    # Optional: max pages to scrape (default: 10)
    max_pages = Column(Integer, nullable=True, default=10)

    # Use Playwright (headless browser) instead of httpx for fetching.
    # Enabled by default - most modern job sites use JavaScript rendering.
    use_playwright = Column(Boolean, default=True)

    # Default location to use when scraper doesn't extract location from page
    # e.g., "Bethel" for City of Bethel jobs, "Kotzebue" for City of Kotzebue
    default_location = Column(String(255), nullable=True)
    # Default state to use when scraper doesn't extract state from page
    # e.g., "AK" for Alaska-only job boards like YKHC
    default_state = Column(String(50), nullable=True)

    # Blocked by robots.txt - site explicitly disallows crawling.
    # These are kept separate from is_active=False (manually disabled).
    robots_blocked = Column(Boolean, default=False)
    robots_blocked_at = Column(DateTime, nullable=True)  # When we detected the block

    # Needs configuration - bulk imported sources that haven't been set up yet.
    # These are kept separate from is_active=False (tried and disabled).
    needs_configuration = Column(Boolean, default=False)

    # Skip robots.txt check - for sites with overly restrictive robots.txt
    # that block all crawlers but are clearly intended to be public job boards
    # (e.g., Oracle E-Business Suite careers pages with blanket Disallow: /)
    skip_robots_check = Column(Boolean, default=False)

    # --- SitemapScraper configuration: sites with XML sitemaps of job URLs ---
    # sitemap_url: URL of the sitemap XML file (e.g., /sitemaps/jobs_1.xml)
    sitemap_url = Column(String(1000), nullable=True)
    # sitemap_url_pattern: Regex to filter URLs (e.g., "-ak/" for Alaska jobs only)
    sitemap_url_pattern = Column(String(500), nullable=True)
    # organization: Organization name to use for all jobs from this source
    # (used when organization can't be extracted from URL)
    organization = Column(String(255), nullable=True)

    jobs = relationship("Job", back_populates="source", cascade="all, delete-orphan")

    def __repr__(self) -> str:
        # Debug-friendly identity; avoids touching lazy-loaded relationships.
        return (
            f"<ScrapeSource id={self.id} name={self.name!r} "
            f"active={self.is_active}>"
        )