
Commit 2dae729

Fixes
1 parent f3f1489 commit 2dae729

2 files changed (+84, -36 lines)

servers/fai/src/fai/models/api/website_api.py

Lines changed: 12 additions & 5 deletions
@@ -10,14 +10,21 @@
 class IndexWebsiteRequest(BaseModel):
     base_url: str = Field(description="The base URL to start indexing from (e.g., 'https://docs.example.com')")
     domain_filter: str | None = Field(
-        default=None, description="Domain to filter the index (e.g., 'docs.example.com')"
+        default=None, description="Domain to filter crawling (e.g., 'docs.example.com'). Defaults to base_url domain."
     )
-    path_include_patterns: list[str] | None = Field(
-        default=None, description="URL patterns to include (e.g., ['/docs/*', '/api/*']). If empty, includes all."
+    path_filter: str | None = Field(
+        default=None,
+        description="Path prefix to restrict crawling (e.g., '/docs'). Only URLs starting with this will be crawled.",
     )
-    path_exclude_patterns: list[str] | None = Field(
-        default=None, description="URL patterns to exclude (e.g., ['/blog/*', '*.pdf'])"
+    url_pattern: str | None = Field(
+        default=None,
+        description="Regex pattern to filter URLs (e.g., 'https://example\\.com/(docs|api)/.*').",
     )
+    chunk_size: int | None = Field(default=1000, description="Size of text chunks for splitting documents")
+    chunk_overlap: int | None = Field(default=200, description="Overlap between consecutive chunks")
+    min_content_length: int | None = Field(default=100, description="Minimum content length to index a page")
+    max_pages: int | None = Field(default=None, description="Maximum number of pages to crawl. None means unlimited.")
+    delay: float | None = Field(default=1.0, description="Delay in seconds between requests")
     version: str | None = Field(default=None, description="Version to tag all indexed pages with")
     product: str | None = Field(default=None, description="Product to tag all indexed pages with")
     authed: bool | None = Field(default=None, description="Whether indexed pages should be auth-gated")
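
For reference, a sketch of how a caller might populate the reworked request model: path_filter and url_pattern take over from the old path_include_patterns / path_exclude_patterns lists, and the new chunking and crawl knobs carry the defaults shown in the diff. The field names come straight from the model above and the import path mirrors the file location, but the route that consumes this payload is not shown in this commit, so treat the model_dump() step as an assumption.

# Hypothetical usage sketch of IndexWebsiteRequest (field names/defaults taken from the diff above).
from fai.models.api.website_api import IndexWebsiteRequest

request = IndexWebsiteRequest(
    base_url="https://docs.example.com",
    domain_filter="docs.example.com",  # optional; defaults to the base_url's domain
    path_filter="/docs",  # only URLs starting with this prefix are crawled
    url_pattern=r"https://docs\.example\.com/(docs|api)/.*",  # optional regex filter
    chunk_size=1000,
    chunk_overlap=200,
    max_pages=500,
    delay=1.0,
    version="v2",
    product="docs",
)

payload = request.model_dump()  # e.g. serialized into the JSON body of the indexing request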

servers/fai/src/fai/routes/website.py

Lines changed: 72 additions & 31 deletions
@@ -73,7 +73,6 @@ async def index_website(
     Returns a job_id to track the crawling progress.
     """
     try:
-        # Check if IndexSourceDb already exists for this base_url
         result = await db.execute(
             select(IndexSourceDb).where(
                 IndexSourceDb.domain == domain,
@@ -83,17 +82,14 @@ async def index_website(
         )
         index_source = result.scalar_one_or_none()
 
-        # Create a job to track the crawling process
         job_id = await job_manager.create_job(db)
 
         if index_source:
-            # Update existing source
             index_source.status = "indexing"
             index_source.last_job_id = job_id
             index_source.config = body.model_dump()
             index_source.updated_at = datetime.now(UTC)
         else:
-            # Create new source
             source_id = str(uuid.uuid4())
             index_source = IndexSourceDb(
                 id=source_id,
@@ -129,43 +125,85 @@ async def _crawl_website_job(
 ) -> None:
     """
     Background job to crawl a website and index its pages.
-
-    TODO: Implement the actual crawling logic:
-    1. Use a web crawler library
-    2. Extract content from pages
-    3. Chunk content if needed
-    4. Create WebsiteSourceDb entries
-    5. Sync to Turbopuffer
-    6. Update job status
     """
+    pages_indexed = 0
+    pages_failed = 0
+
     try:
-        # Placeholder for actual crawling logic
-        # pages = await crawl_website(config.base_url, config.max_depth, ...)
+        from fai.utils.website.crawler import DocumentationCrawler
+
+        crawler = DocumentationCrawler(
+            start_url=config.base_url,
+            domain_filter=config.domain_filter,
+            path_filter=config.path_filter,
+            url_pattern=config.url_pattern,
+            chunk_size=config.chunk_size or 1000,
+            chunk_overlap=config.chunk_overlap or 200,
+            min_content_length=config.min_content_length or 100,
+        )
+
+        loop = asyncio.get_event_loop()
+        chunks = await loop.run_in_executor(
+            None, lambda: crawler.crawl(max_pages=config.max_pages, delay=config.delay or 1.0, verbose=True)
+        )
+
+        LOGGER.info(f"Crawled {len(chunks)} chunks from {config.base_url}")
+
+        for chunk in chunks:
+            try:
+                chunk_id = str(uuid.uuid4())
+
+                metadata = chunk.metadata
+                page_url = metadata.get("url")
+                document_title = metadata.get("document_title")
+
+                keywords_val = metadata.get("keywords")
+                keywords = None
+                if keywords_val and isinstance(keywords_val, str):
+                    keywords = [k.strip() for k in keywords_val.split(",") if k.strip()]
+
+                website_entry = WebsiteDb(
+                    id=chunk_id,
+                    domain=domain,
+                    base_url=config.base_url,
+                    page_url=str(page_url) if page_url else config.base_url,
+                    chunk=chunk.content,
+                    document=chunk.content,
+                    title=str(document_title) if document_title else None,
+                    version=config.version,
+                    product=config.product,
+                    keywords=keywords,
+                    authed=config.authed if config.authed is not None else False,
+                    created_at=datetime.now(UTC),
+                    updated_at=datetime.now(UTC),
+                )
 
-        # For each page:
-        # - Create WebsiteSourceDb row
-        # - Add to database
+                db.add(website_entry)
+                pages_indexed += 1
 
-        # await db.commit()
-        # await sync_website_db_to_tpuf(domain, db)
-        # await sync_index_to_target(domain, get_website_index_name(), get_query_index_name())
+            except Exception as e:
+                LOGGER.error(f"Failed to create WebsiteDb entry for chunk: {e}")
+                pages_failed += 1
+
+        pages_failed += len(crawler.failed_urls)
+
+        LOGGER.info(f"Created {pages_indexed} WebsiteDb entries, {pages_failed} failed")
+
+        await db.commit()
+        await sync_website_db_to_tpuf(domain, db)
+        await sync_index_to_target(domain, get_website_index_name(), get_query_index_name())
 
-        # Update IndexSourceDb on success
         result = await db.execute(select(IndexSourceDb).where(IndexSourceDb.id == source_id))
        index_source = result.scalar_one_or_none()
 
         if index_source:
             index_source.status = "active"
             index_source.last_indexed_at = datetime.now(UTC)
             index_source.updated_at = datetime.now(UTC)
-            index_source.last_error = None
-            index_source.last_error_at = None
 
-            # Update metrics with crawl results
-            # This should be populated with actual results from the crawling logic
             index_source.metrics = {
-                "pages_indexed": 0,  # TODO: Update with actual count
-                "pages_failed": 0,  # TODO: Update with actual count
+                "pages_indexed": pages_indexed,
+                "pages_failed": pages_failed,
             }
 
         await db.commit()
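
The DocumentationCrawler imported in the job above lives in fai.utils.website.crawler, which this commit does not touch, so its definition is not visible here. Purely as an inferred sketch of the interface the job appears to rely on (constructor arguments, a synchronous crawl() returning chunks with content and metadata, and a failed_urls attribute) -- the names and signatures below are reconstructed from the call site and are not confirmed:

# Inferred (hypothetical) interface for the crawler used by _crawl_website_job.
# The real class is defined elsewhere in the repo and may differ.
from dataclasses import dataclass, field


@dataclass
class CrawledChunk:
    # The job reads chunk.content plus the metadata keys "url", "document_title", "keywords".
    content: str
    metadata: dict[str, str] = field(default_factory=dict)


class DocumentationCrawler:
    def __init__(
        self,
        start_url: str,
        domain_filter: str | None = None,
        path_filter: str | None = None,
        url_pattern: str | None = None,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        min_content_length: int = 100,
    ) -> None:
        self.start_url = start_url
        # URLs that could not be fetched or parsed; the job adds len(failed_urls) to pages_failed.
        self.failed_urls: list[str] = []

    def crawl(
        self, max_pages: int | None = None, delay: float = 1.0, verbose: bool = False
    ) -> list[CrawledChunk]:
        # Synchronous by design; the route wraps this call in loop.run_in_executor.
        raise NotImplementedError
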
@@ -174,15 +212,18 @@ async def _crawl_website_job(
     except Exception:
         LOGGER.exception(f"Failed to complete website crawl job {job_id}")
 
-        # Update IndexSourceDb on failure
         result = await db.execute(select(IndexSourceDb).where(IndexSourceDb.id == source_id))
         index_source = result.scalar_one_or_none()
 
         if index_source:
             index_source.status = "failed"
-            index_source.last_error = str(e)
-            index_source.last_error_at = datetime.now(UTC)
             index_source.updated_at = datetime.now(UTC)
+
+            index_source.metrics = {
+                "pages_indexed": pages_indexed,
+                "pages_failed": pages_failed,
+            }
+
         await db.commit()
@@ -225,7 +266,7 @@ async def get_website_status(
         error = None
     else:
         status = index_source.status
-        error = index_source.last_error or job.error
+        error = job.error
 
     response = GetWebsiteStatusResponse(
         job_id=job.id,
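
One design choice worth noting from the diff: crawler.crawl() is a blocking, synchronous call, so _crawl_website_job hands it to the default thread pool via loop.run_in_executor instead of calling it inline, which keeps the event loop free while pages are fetched. A minimal, self-contained sketch of that pattern, using a stand-in blocking function rather than the real crawler:

# Sketch of the offloading pattern used in _crawl_website_job (stand-in crawl function).
import asyncio
import time


def blocking_crawl(delay: float) -> list[str]:
    # Simulates a synchronous crawler doing network I/O.
    time.sleep(delay)
    return ["chunk-1", "chunk-2"]


async def main() -> None:
    loop = asyncio.get_event_loop()  # the job uses get_event_loop(); get_running_loop() also works here
    # Run the blocking call in the default ThreadPoolExecutor so the event loop stays responsive.
    chunks = await loop.run_in_executor(None, lambda: blocking_crawl(delay=0.5))
    print(f"Crawled {len(chunks)} chunks")


if __name__ == "__main__":
    asyncio.run(main())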
