mbuckingham74 · mbuckingham74 · Dec 2, 2025 · Dec 2, 2025 · Dec 2, 2025 · Dec 2, 2025
diff --git a/CLAUDE_STATUS.md b/CLAUDE_STATUS.md
@@ -349,7 +349,12 @@ cd backend && pytest tests/ -v
 - `backend/scraper/playwright_fetcher.py` - Python client for Playwright service
 - `backend/scraper/sources/generic.py` - `_fetch_page()` with Playwright/httpx logic
 - `backend/scraper/runner.py` - Always enables Playwright for all scrapers
-- `backend/alembic/versions/006_add_use_playwright.py` - Migration (legacy)
+- `backend/alembic/versions/019_enable_playwright_by_default.py` - Backfills `use_playwright=True` on existing sources and makes `True` the column default
+
+**Database Default:**
+- `use_playwright` column defaults to `True` for new sources (migration 019)
+- All existing sources were updated to `use_playwright=True`
+- The toggle exists in admin for rare cases where httpx-only is needed
 
 **Interactive Page Features (Playwright):**
 - `selectActions` - Array of `{selector, value}` for dropdown selection before page extraction

diff --git a/backend/alembic/versions/019_enable_playwright_by_default.py b/backend/alembic/versions/019_enable_playwright_by_default.py
@@ -0,0 +1,52 @@
+"""Enable Playwright by default for all sources
+
+This migration:
+1. Sets use_playwright=True for all existing sources that have it False or NULL
+2. Changes the column default to True for new sources
+
+Playwright is required for most modern job sites that use JavaScript rendering.
+Without it, the scraper only gets the initial HTML before JS executes, missing
+dynamically loaded job listings.
+
+Revision ID: 019
+Revises: 018
+Create Date: 2025-12-01
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '019'
+down_revision = '018'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Turn Playwright on for every existing source and make it the column default."""
+    # Backfill first: rows that are FALSE or NULL are switched to TRUE.
+    op.execute(
+        "UPDATE scrape_sources SET use_playwright = TRUE WHERE use_playwright = FALSE OR use_playwright IS NULL"
+    )
+    # sa.true() renders the dialect's own true literal (1 on MySQL) instead of hard-coding it.
+    op.alter_column(
+        'scrape_sources',
+        'use_playwright',
+        server_default=sa.true(),
+        existing_type=sa.Boolean(),
+        existing_nullable=True,
+    )
+
+
+def downgrade() -> None:
+    """Revert the column default to false; row data is deliberately left untouched."""
+    # Existing rows are not reverted, as that could break working scrapers.
+    op.alter_column(
+        'scrape_sources',
+        'use_playwright',
+        server_default=sa.false(),  # dialect-appropriate false literal (0 on MySQL)
+        existing_type=sa.Boolean(),
+        existing_nullable=True,
+    )
diff --git a/backend/app/models/scrape_source.py b/backend/app/models/scrape_source.py
@@ -41,8 +41,8 @@ class ScrapeSource(Base):
     max_pages = Column(Integer, nullable=True, default=10)
 
     # Use Playwright (headless browser) instead of httpx for fetching
-    # Useful for sites with bot protection or JavaScript-rendered content
-    use_playwright = Column(Boolean, default=False)
+    # Enabled by default - most modern job sites use JavaScript rendering
+    use_playwright = Column(Boolean, default=True)
 
     # Default location to use when scraper doesn't extract location from page
     # e.g., "Bethel" for City of Bethel jobs, "Kotzebue" for City of Kotzebue

diff --git a/backend/app/routers/admin.py b/backend/app/routers/admin.py
@@ -1074,9 +1074,9 @@ async def trigger_single_source_scrape(source_id: int, request: Request, db: Ses
 
         duration = time.time() - start_time
 
-        # Auto-enable source if it was in needs_configuration and scrape was successful
+        # Auto-enable source if it was in needs_configuration and jobs were found
         auto_enabled = False
-        if source.needs_configuration and result.jobs_found > 0 and not result.errors:
+        if source.needs_configuration and result.jobs_found > 0:
             source.is_active = True
             source.needs_configuration = False
             auto_enabled = True

diff --git a/backend/app/templates/admin/configure_source.html b/backend/app/templates/admin/configure_source.html
@@ -243,6 +243,17 @@ <h3 class="text-lg font-semibold dark:text-white mb-2">Scraper Type</h3>
                 <strong>Dynamic:</strong> Uses AI-generated custom scraper code. Only use if others don't work.
             </p>
         </div>
+        <div class="mt-4">
+            <label class="flex items-center gap-3 cursor-pointer">
+                <input type="checkbox" id="use_playwright" name="use_playwright" value="1"
+                       {# an unset (none) value counts as enabled #}{% if source.use_playwright is none or source.use_playwright %}checked{% endif %}
+                       class="w-4 h-4 text-blue-600 bg-gray-100 dark:bg-gray-700 border-gray-300 dark:border-gray-600 rounded focus:ring-blue-500 focus:ring-2">
+                <span class="text-sm font-medium text-gray-700 dark:text-gray-300">Use Playwright (Headless Browser)</span>
+            </label>
+            <p class="text-xs text-gray-500 dark:text-gray-400 mt-1 ml-7">
+                Enabled by default. Uses a real browser to render JavaScript-heavy pages. Disable only for simple static HTML sites.
+            </p>
+        </div>
     </div>
 
     <!-- Sitemap Configuration (shown when SitemapScraper selected) -->

diff --git a/backend/app/templates/admin/scraper_guide.html b/backend/app/templates/admin/scraper_guide.html
@@ -507,7 +507,7 @@ <h4 class="font-semibold text-gray-900 dark:text-white mb-2">State Abbreviation<
 <!-- Playwright Features -->
 <div class="bg-white dark:bg-gray-800 rounded-lg shadow-sm border border-gray-200 dark:border-gray-700 p-6 mb-8">
     <h3 class="text-lg font-semibold text-gray-900 dark:text-white mb-2">Playwright Features</h3>
-    <p class="text-gray-600 dark:text-gray-400 mb-4">Playwright is a headless browser that renders JavaScript. It's used automatically for all scrapers but provides extra features for DynamicScrapers.</p>
+    <p class="text-gray-600 dark:text-gray-400 mb-4">Playwright is a headless browser that renders JavaScript. <strong>It's enabled by default for all sources</strong> to ensure JavaScript-rendered job listings are properly loaded. DynamicScrapers can also use these additional interactive features:</p>
 
     <div class="grid md:grid-cols-2 gap-4">
         <div class="p-4 bg-gray-50 dark:bg-gray-900 rounded">
@@ -551,7 +551,7 @@ <h3 class="text-lg font-semibold text-gray-900 dark:text-white mb-2">Special Fla
                 <span class="px-2 py-1 bg-blue-100 dark:bg-blue-900 text-blue-800 dark:text-blue-200 rounded text-xs font-medium">use_playwright</span>
             </div>
             <div>
-                <p class="text-sm text-gray-600 dark:text-gray-400">Force Playwright browser rendering. Enabled by default for all scrapers, but can be explicitly set for DynamicScrapers that need it.</p>
+                <p class="text-sm text-gray-600 dark:text-gray-400"><strong>Enabled by default.</strong> All new sources use Playwright browser rendering automatically. This ensures JavaScript-rendered content is properly loaded. Only disable for rare cases where httpx-only is specifically needed.</p>
             </div>
         </div>
         <div class="flex items-start gap-4 p-4 bg-gray-50 dark:bg-gray-900 rounded">
@@ -581,10 +581,10 @@ <h3 class="text-lg font-semibold text-gray-900 dark:text-white mb-4">Troubleshoo
         <div>
             <h4 class="font-medium text-gray-900 dark:text-white mb-1">No jobs found</h4>
             <ul class="text-sm text-gray-600 dark:text-gray-400 list-disc list-inside">
-                <li>Check if the page requires JavaScript - enable Playwright</li>
-                <li>Verify CSS selectors match actual page structure</li>
+                <li>Verify CSS selectors match actual page structure (use browser DevTools)</li>
                 <li>Check for robots.txt blocking in scrape history</li>
                 <li>Try "Analyze Page with AI" for selector suggestions</li>
+                <li>Confirm "Use Playwright" is still enabled for the source (it is on by default) - if issues persist, check Playwright service logs</li>
             </ul>
         </div>
         <div>

diff --git a/backend/scraper/runner.py b/backend/scraper/runner.py
@@ -272,8 +272,8 @@ def get_source_config(source: ScrapeSource) -> dict:
         "url_attribute": source.url_attribute,
         "selector_next_page": source.selector_next_page,
         "max_pages": source.max_pages,
-        # Always use Playwright - overhead is minimal vs failing on JS sites
-        "use_playwright": True,
+        # Use Playwright by default (True), but respect database setting for rare httpx-only cases
+        "use_playwright": source.use_playwright if source.use_playwright is not None else True,
         "default_location": source.default_location,
         "default_state": source.default_state,
         # SitemapScraper configuration
@@ -341,7 +341,9 @@ def _run_adp_scraper(
 
     source.last_scraped_at = datetime.now(timezone.utc)
 
-    source.last_scrape_success = len(all_errors) == 0
+    # Success if jobs were found, even with warnings
+    jobs_found = jobs_new + jobs_updated + jobs_unchanged
+    source.last_scrape_success = jobs_found > 0 or len(all_errors) == 0
     duration = time.time() - start_time
 
     logger.info(
@@ -419,7 +421,10 @@ def _run_ultipro_scraper(
             logger.exception(f"UltiPro scraper failed for {source.name} URL: {listing_url}")
 
     source.last_scraped_at = datetime.now(timezone.utc)
-    source.last_scrape_success = len(all_errors) == 0
+
+    # Success if jobs were found, even with warnings
+    jobs_found = jobs_new + jobs_updated + jobs_unchanged
+    source.last_scrape_success = jobs_found > 0 or len(all_errors) == 0
     duration = time.time() - start_time
 
     logger.info(
@@ -497,7 +502,10 @@ def _run_workday_scraper(
             logger.exception(f"Workday scraper failed for {source.name} URL: {listing_url}")
 
     source.last_scraped_at = datetime.now(timezone.utc)
-    source.last_scrape_success = len(all_errors) == 0
+
+    # Success if jobs were found, even with warnings
+    jobs_found = jobs_new + jobs_updated + jobs_unchanged
+    source.last_scrape_success = jobs_found > 0 or len(all_errors) == 0
     duration = time.time() - start_time
 
     logger.info(
@@ -678,7 +686,9 @@ def run_scraper(db: Session, source: ScrapeSource, trigger_type: str = "manual")
         all_errors.append(f"Scraper execution failed: {e}")
 
     # Update source's last_scrape_success status
-    source.last_scrape_success = len(all_errors) == 0
+    # Success if jobs were found, even with warnings
+    jobs_found = jobs_new + jobs_updated + jobs_unchanged
+    source.last_scrape_success = jobs_found > 0 or len(all_errors) == 0
 
     duration = time.time() - start_time
 

diff --git a/backend/tests/test_models.py b/backend/tests/test_models.py
@@ -521,11 +521,40 @@ def test_source_default_values(self, db):
 
         assert source.scraper_class == "GenericScraper"
         assert source.is_active is True
-        assert source.use_playwright is False
+        assert source.use_playwright is True  # Default to True for JS-rendered sites
         assert source.max_pages == 10
         assert source.url_attribute == "href"
         assert source.created_at is not None
 
+    def test_source_playwright_default_is_true(self, db):
+        """A source created without use_playwright comes back as True.
+
+        Job sites commonly render listings with JavaScript, so the headless
+        browser is the default fetch path rather than an opt-in.
+        """
+        fields = {"name": "Playwright Default Test", "base_url": "https://example.com"}
+        defaulted = ScrapeSource(**fields)
+
+        # Add, commit and refresh before asserting on the column value.
+        db.add(defaulted)
+        db.commit()
+        db.refresh(defaulted)
+
+        assert defaulted.use_playwright is True, "New sources should default to use_playwright=True"
+
+    def test_source_playwright_can_be_disabled(self, db):
+        """An explicit use_playwright=False is kept as given, not replaced by the default."""
+        fields = {
+            "name": "No Playwright Source",
+            "base_url": "https://example.com",
+            "use_playwright": False,
+        }
+        httpx_only = ScrapeSource(**fields)
+        db.add(httpx_only)
+        db.commit()
+        db.refresh(httpx_only)
+        assert httpx_only.use_playwright is False, "Should be able to explicitly disable Playwright"
+
     def test_source_jobs_relationship(self, db):
         """ScrapeSource has jobs relationship."""
         source = ScrapeSource(