-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape_source.py
More file actions
78 lines (64 loc) · 3.97 KB
/
scrape_source.py
File metadata and controls
78 lines (64 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text
from sqlalchemy.orm import relationship
from sqlalchemy.sql import func
from app.database import Base
class ScrapeSource(Base):
    """A website configured as a source of job listings for the scraper pipeline.

    Each row describes one site and how to scrape it: either via the
    CSS-selector-driven ``GenericScraper``, a ``SitemapScraper`` (XML sitemap),
    or AI-generated custom scraper code stored in ``custom_scraper_code``.

    A source can be in one of several distinct states, tracked by separate
    flags so their meanings don't get conflated:

    - ``is_active=False``       — manually disabled after being tried.
    - ``robots_blocked=True``   — site's robots.txt disallows crawling.
    - ``needs_configuration=True`` — bulk-imported, selectors not set up yet.
    - otherwise active and eligible for scraping.
    """

    __tablename__ = "scrape_sources"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String(255), nullable=False)
    base_url = Column(String(1000), nullable=False)
    scraper_class = Column(String(255), nullable=False, default="GenericScraper")
    is_active = Column(Boolean, default=True)

    # AI-generated custom scraper code (for sites that can't use GenericScraper)
    custom_scraper_code = Column(Text, nullable=True)

    last_scraped_at = Column(DateTime, nullable=True)
    # True=success, False=fail, None=never run
    last_scrape_success = Column(Boolean, nullable=True)
    created_at = Column(DateTime, server_default=func.now())

    # --- GenericScraper configuration: CSS selectors for parsing job listings ---
    # The listing_url is the page containing job listings (can be same as base_url)
    listing_url = Column(String(1000), nullable=True)
    # CSS selector for individual job containers (e.g., ".job-card", "tr.job-row")
    selector_job_container = Column(String(500), nullable=True)
    # CSS selectors for fields within each job container
    selector_title = Column(String(500), nullable=True)
    selector_url = Column(String(500), nullable=True)
    selector_organization = Column(String(500), nullable=True)
    selector_location = Column(String(500), nullable=True)
    selector_job_type = Column(String(500), nullable=True)
    selector_salary = Column(String(500), nullable=True)
    selector_description = Column(String(500), nullable=True)
    # Optional: attribute to extract URL from (default: "href")
    url_attribute = Column(String(100), nullable=True, default="href")
    # Optional: pagination selector for multi-page listings
    selector_next_page = Column(String(500), nullable=True)
    # Optional: max pages to scrape (default: 10)
    max_pages = Column(Integer, nullable=True, default=10)

    # Use Playwright (headless browser) instead of httpx for fetching.
    # Enabled by default - most modern job sites use JavaScript rendering.
    use_playwright = Column(Boolean, default=True)

    # Default location to use when scraper doesn't extract location from page
    # e.g., "Bethel" for City of Bethel jobs, "Kotzebue" for City of Kotzebue
    default_location = Column(String(255), nullable=True)
    # Default state to use when scraper doesn't extract state from page
    # e.g., "AK" for Alaska-only job boards like YKHC
    default_state = Column(String(50), nullable=True)

    # Blocked by robots.txt - site explicitly disallows crawling.
    # These are kept separate from is_active=False (manually disabled).
    robots_blocked = Column(Boolean, default=False)
    robots_blocked_at = Column(DateTime, nullable=True)  # When we detected the block

    # Needs configuration - bulk imported sources that haven't been set up yet.
    # These are kept separate from is_active=False (tried and disabled).
    needs_configuration = Column(Boolean, default=False)

    # Skip robots.txt check - for sites with overly restrictive robots.txt
    # that block all crawlers but are clearly intended to be public job boards
    # (e.g., Oracle E-Business Suite careers pages with blanket Disallow: /)
    skip_robots_check = Column(Boolean, default=False)

    # --- SitemapScraper configuration: sites with XML sitemaps of job URLs ---
    # sitemap_url: URL of the sitemap XML file (e.g., /sitemaps/jobs_1.xml)
    sitemap_url = Column(String(1000), nullable=True)
    # sitemap_url_pattern: Regex to filter URLs (e.g., "-ak/" for Alaska jobs only)
    sitemap_url_pattern = Column(String(500), nullable=True)
    # organization: Organization name to use for all jobs from this source
    # (used when organization can't be extracted from URL)
    organization = Column(String(255), nullable=True)

    jobs = relationship("Job", back_populates="source", cascade="all, delete-orphan")

    def __repr__(self) -> str:
        # Debug-friendly identity; avoids touching lazy-loaded relationships.
        return (
            f"<ScrapeSource id={self.id} name={self.name!r} "
            f"active={self.is_active}>"
        )