Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CLAUDE_STATUS.md
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,12 @@ cd backend && pytest tests/ -v
- `backend/scraper/playwright_fetcher.py` - Python client for Playwright service
- `backend/scraper/sources/generic.py` - `_fetch_page()` with Playwright/httpx logic
- `backend/scraper/runner.py` - Enables Playwright by default; honors each source's `use_playwright` setting (an unset/`None` value is treated as `True`)
- `backend/alembic/versions/006_add_use_playwright.py` - Migration (legacy)
- `backend/alembic/versions/019_enable_playwright_by_default.py` - Sets `use_playwright=True` for all sources

**Database Default:**
- `use_playwright` column defaults to `True` for new sources (migration 019)
- All existing sources were updated to `use_playwright=True`
- The toggle exists in admin for rare cases where httpx-only is needed

**Interactive Page Features (Playwright):**
- `selectActions` - Array of `{selector, value}` for dropdown selection before page extraction
Expand Down
52 changes: 52 additions & 0 deletions backend/alembic/versions/019_enable_playwright_by_default.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""Enable Playwright by default for all sources

This migration:
1. Sets use_playwright=True for all existing sources that have it False or NULL
2. Changes the column default to True for new sources

Playwright is required for most modern job sites that use JavaScript rendering.
Without it, the scraper only gets the initial HTML before JS executes, missing
dynamically loaded job listings.

Revision ID: 019
Revises: 018
Create Date: 2025-12-01

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = '019'
down_revision = '018'
branch_labels = None
depends_on = None


def upgrade() -> None:
# Enable Playwright for all existing sources
op.execute(
"UPDATE scrape_sources SET use_playwright = TRUE WHERE use_playwright = FALSE OR use_playwright IS NULL"
)

# Change the column default to True for new sources
op.alter_column(
'scrape_sources',
'use_playwright',
server_default=sa.text('1'), # MySQL uses 1 for True
Comment on lines +34 to +37

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Align ORM default with Playwright migration

The migration sets a server default of TRUE for scrape_sources.use_playwright, but the SQLAlchemy model ScrapeSource.use_playwright still declares default=False (backend/app/models/scrape_source.py). Because the admin create/import paths instantiate ScrapeSource without setting this field, SQLAlchemy will send False on insert, overriding the new server default. New sources will therefore continue to have Playwright disabled despite the intent to enable it by default. Update the model (or omit the Python default) so inserts inherit the new default.

Useful? React with 👍 / 👎.

existing_type=sa.Boolean(),
existing_nullable=True
)


def downgrade() -> None:
    """Set the ``scrape_sources.use_playwright`` server default back to 0 (False).

    Only the column default is touched. Row values are intentionally left
    as they are: flipping existing sources back could break scrapers that
    are currently working.
    """
    # Collect the ALTER COLUMN options first, then apply them in one call.
    column_options = dict(
        server_default=sa.text('0'),
        existing_type=sa.Boolean(),
        existing_nullable=True,
    )
    op.alter_column('scrape_sources', 'use_playwright', **column_options)
4 changes: 2 additions & 2 deletions backend/app/models/scrape_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ class ScrapeSource(Base):
max_pages = Column(Integer, nullable=True, default=10)

# Use Playwright (headless browser) instead of httpx for fetching
# Useful for sites with bot protection or JavaScript-rendered content
use_playwright = Column(Boolean, default=False)
# Enabled by default - most modern job sites use JavaScript rendering
use_playwright = Column(Boolean, default=True)

# Default location to use when scraper doesn't extract location from page
# e.g., "Bethel" for City of Bethel jobs, "Kotzebue" for City of Kotzebue
Expand Down
4 changes: 2 additions & 2 deletions backend/app/routers/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -1074,9 +1074,9 @@ async def trigger_single_source_scrape(source_id: int, request: Request, db: Ses

duration = time.time() - start_time

# Auto-enable source if it was in needs_configuration and scrape was successful
# Auto-enable source if it was in needs_configuration and jobs were found
auto_enabled = False
if source.needs_configuration and result.jobs_found > 0 and not result.errors:
if source.needs_configuration and result.jobs_found > 0:
source.is_active = True
source.needs_configuration = False
auto_enabled = True
Expand Down
11 changes: 11 additions & 0 deletions backend/app/templates/admin/configure_source.html
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,17 @@ <h3 class="text-lg font-semibold dark:text-white mb-2">Scraper Type</h3>
<strong>Dynamic:</strong> Uses AI-generated custom scraper code. Only use if others don't work.
</p>
</div>
<div class="mt-4">
<label class="flex items-center gap-3 cursor-pointer">
<input type="checkbox" id="use_playwright" name="use_playwright" value="1"
{% if source.use_playwright is not none %}{% if source.use_playwright %}checked{% endif %}{% else %}checked{% endif %}
class="w-4 h-4 text-blue-600 bg-gray-100 dark:bg-gray-700 border-gray-300 dark:border-gray-600 rounded focus:ring-blue-500 focus:ring-2">
<span class="text-sm font-medium text-gray-700 dark:text-gray-300">Use Playwright (Headless Browser)</span>
</label>
<p class="text-xs text-gray-500 dark:text-gray-400 mt-1 ml-7">
Enabled by default. Uses a real browser to render JavaScript-heavy pages. Disable only for simple static HTML sites.
</p>
</div>
</div>

<!-- Sitemap Configuration (shown when SitemapScraper selected) -->
Expand Down
8 changes: 4 additions & 4 deletions backend/app/templates/admin/scraper_guide.html
Original file line number Diff line number Diff line change
Expand Up @@ -507,7 +507,7 @@ <h4 class="font-semibold text-gray-900 dark:text-white mb-2">State Abbreviation<
<!-- Playwright Features -->
<div class="bg-white dark:bg-gray-800 rounded-lg shadow-sm border border-gray-200 dark:border-gray-700 p-6 mb-8">
<h3 class="text-lg font-semibold text-gray-900 dark:text-white mb-2">Playwright Features</h3>
<p class="text-gray-600 dark:text-gray-400 mb-4">Playwright is a headless browser that renders JavaScript. It's used automatically for all scrapers but provides extra features for DynamicScrapers.</p>
<p class="text-gray-600 dark:text-gray-400 mb-4">Playwright is a headless browser that renders JavaScript. <strong>It's enabled by default for all sources</strong> to ensure JavaScript-rendered job listings are properly loaded. DynamicScrapers can also use these additional interactive features:</p>

<div class="grid md:grid-cols-2 gap-4">
<div class="p-4 bg-gray-50 dark:bg-gray-900 rounded">
Expand Down Expand Up @@ -551,7 +551,7 @@ <h3 class="text-lg font-semibold text-gray-900 dark:text-white mb-2">Special Fla
<span class="px-2 py-1 bg-blue-100 dark:bg-blue-900 text-blue-800 dark:text-blue-200 rounded text-xs font-medium">use_playwright</span>
</div>
<div>
<p class="text-sm text-gray-600 dark:text-gray-400">Force Playwright browser rendering. Enabled by default for all scrapers, but can be explicitly set for DynamicScrapers that need it.</p>
<p class="text-sm text-gray-600 dark:text-gray-400"><strong>Enabled by default.</strong> All new sources use Playwright browser rendering automatically. This ensures JavaScript-rendered content is properly loaded. Only disable for rare cases where httpx-only is specifically needed.</p>
</div>
</div>
<div class="flex items-start gap-4 p-4 bg-gray-50 dark:bg-gray-900 rounded">
Expand Down Expand Up @@ -581,10 +581,10 @@ <h3 class="text-lg font-semibold text-gray-900 dark:text-white mb-4">Troubleshoo
<div>
<h4 class="font-medium text-gray-900 dark:text-white mb-1">No jobs found</h4>
<ul class="text-sm text-gray-600 dark:text-gray-400 list-disc list-inside">
<li>Check if the page requires JavaScript - enable Playwright</li>
<li>Verify CSS selectors match actual page structure</li>
<li>Verify CSS selectors match actual page structure (use browser DevTools)</li>
<li>Check for robots.txt blocking in scrape history</li>
<li>Try "Analyze Page with AI" for selector suggestions</li>
<li>Playwright is enabled by default - if issues persist, check Playwright service logs</li>
</ul>
</div>
<div>
Expand Down
22 changes: 16 additions & 6 deletions backend/scraper/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,8 @@ def get_source_config(source: ScrapeSource) -> dict:
"url_attribute": source.url_attribute,
"selector_next_page": source.selector_next_page,
"max_pages": source.max_pages,
# Always use Playwright - overhead is minimal vs failing on JS sites
"use_playwright": True,
# Use Playwright by default (True), but respect database setting for rare httpx-only cases
"use_playwright": source.use_playwright if source.use_playwright is not None else True,
"default_location": source.default_location,
"default_state": source.default_state,
# SitemapScraper configuration
Expand Down Expand Up @@ -341,7 +341,9 @@ def _run_adp_scraper(

source.last_scraped_at = datetime.now(timezone.utc)

source.last_scrape_success = len(all_errors) == 0
# Success if jobs were found, even with warnings
jobs_found = jobs_new + jobs_updated + jobs_unchanged
source.last_scrape_success = jobs_found > 0 or len(all_errors) == 0
duration = time.time() - start_time

logger.info(
Expand Down Expand Up @@ -419,7 +421,10 @@ def _run_ultipro_scraper(
logger.exception(f"UltiPro scraper failed for {source.name} URL: {listing_url}")

source.last_scraped_at = datetime.now(timezone.utc)
source.last_scrape_success = len(all_errors) == 0

# Success if jobs were found, even with warnings
jobs_found = jobs_new + jobs_updated + jobs_unchanged
source.last_scrape_success = jobs_found > 0 or len(all_errors) == 0
duration = time.time() - start_time

logger.info(
Expand Down Expand Up @@ -497,7 +502,10 @@ def _run_workday_scraper(
logger.exception(f"Workday scraper failed for {source.name} URL: {listing_url}")

source.last_scraped_at = datetime.now(timezone.utc)
source.last_scrape_success = len(all_errors) == 0

# Success if jobs were found, even with warnings
jobs_found = jobs_new + jobs_updated + jobs_unchanged
source.last_scrape_success = jobs_found > 0 or len(all_errors) == 0
duration = time.time() - start_time

logger.info(
Expand Down Expand Up @@ -678,7 +686,9 @@ def run_scraper(db: Session, source: ScrapeSource, trigger_type: str = "manual")
all_errors.append(f"Scraper execution failed: {e}")

# Update source's last_scrape_success status
source.last_scrape_success = len(all_errors) == 0
# Success if jobs were found, even with warnings
jobs_found = jobs_new + jobs_updated + jobs_unchanged
source.last_scrape_success = jobs_found > 0 or len(all_errors) == 0

duration = time.time() - start_time

Expand Down
31 changes: 30 additions & 1 deletion backend/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,11 +521,40 @@ def test_source_default_values(self, db):

assert source.scraper_class == "GenericScraper"
assert source.is_active is True
assert source.use_playwright is False
assert source.use_playwright is True # Default to True for JS-rendered sites
assert source.max_pages == 10
assert source.url_attribute == "href"
assert source.created_at is not None

def test_source_playwright_default_is_true(self, db):
    """A source created without ``use_playwright`` is stored with it set to True.

    Playwright is on by default because most modern job sites render their
    listings with JavaScript; without it, dynamically loaded content is missed.
    """
    # Deliberately omit use_playwright so the model default is exercised.
    fields = {
        "name": "Playwright Default Test",
        "base_url": "https://example.com",
    }
    created = ScrapeSource(**fields)

    # Round-trip through the session so the persisted value is what we check.
    db.add(created)
    db.commit()
    db.refresh(created)

    playwright_flag = created.use_playwright
    assert playwright_flag is True, "New sources should default to use_playwright=True"

def test_source_playwright_can_be_disabled(self, db):
    """An explicit ``use_playwright=False`` survives a commit/refresh round-trip.

    Covers the rare httpx-only sources that need Playwright turned off.
    """
    fields = {
        "name": "No Playwright Source",
        "base_url": "https://example.com",
        "use_playwright": False,
    }
    created = ScrapeSource(**fields)

    # Persist and reload so the stored value, not the constructor argument, is checked.
    db.add(created)
    db.commit()
    db.refresh(created)

    playwright_flag = created.use_playwright
    assert playwright_flag is False, "Should be able to explicitly disable Playwright"

def test_source_jobs_relationship(self, db):
"""ScrapeSource has jobs relationship."""
source = ScrapeSource(
Expand Down