
Commit bd1d3b1: Changes to use index_source_db
Parent: 19b72aa

3 files changed: +140, -40 lines


servers/fai/src/fai/models/api/website_api.py

Lines changed: 3 additions & 5 deletions
@@ -8,11 +8,9 @@


 class IndexWebsiteRequest(BaseModel):
-    base_url: str = Field(description="The base URL to start crawling from (e.g., 'https://docs.example.com')")
-    max_depth: int | None = Field(
-        default=1, description="Maximum depth to crawl from base URL (1 = only pages linked from base URL)"
-    )
-    include_patterns: list[str] | None = Field(
+    base_url: str = Field(description="The base URL to start indexing from (e.g., 'https://docs.example.com')")
+    domain_filter: str | None = Field(default=None, description="Domain to filter the index (e.g., 'docs.example.com')")
+    path_include_patterns: list[str] | None = Field(
         default=None, description="URL patterns to include (e.g., ['/docs/*', '/api/*']). If empty, includes all."
     )
     exclude_patterns: list[str] | None = Field(
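
For reference, a request using the renamed fields might be constructed like the sketch below. Only the field names come from this diff; the values and the call itself are illustrative.

# Illustrative only: builds the updated request model shown above with made-up values.
from fai.models.api.website_api import IndexWebsiteRequest

request = IndexWebsiteRequest(
    base_url="https://docs.example.com",
    domain_filter="docs.example.com",
    path_include_patterns=["/docs/*", "/api/*"],
    exclude_patterns=["/blog/*"],
)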

servers/fai/src/fai/routes/website.py

Lines changed: 136 additions & 34 deletions
@@ -1,4 +1,9 @@
 import asyncio
+import uuid
+from datetime import (
+    UTC,
+    datetime,
+)

 from fastapi import (
     Body,
@@ -33,6 +38,10 @@
     ReindexWebsiteRequest,
     ReindexWebsiteResponse,
 )
+from fai.models.db.index_source_db import (
+    IndexSourceDb,
+    SourceType,
+)
 from fai.models.db.website_db import WebsiteDb
 from fai.settings import LOGGER
 from fai.utils.jobs import job_manager
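
The IndexSourceDb model itself is not part of this commit. Judging from the attributes read and written below (id, domain, source_type, source_identifier, config, status, last_job_id, metrics, last_indexed_at, last_error, last_error_at, created_at, updated_at), it plausibly looks something like this sketch; the table name, column types, nullability, and declarative base are all assumptions, not the project's actual definition.

# Hypothetical sketch of fai.models.db.index_source_db, inferred from usage in this diff.
import enum
from datetime import datetime

from sqlalchemy import JSON, DateTime, Enum, String
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):  # assumed; the project presumably has its own base class
    pass


class SourceType(enum.Enum):
    WEBSITE = "website"


class IndexSourceDb(Base):
    __tablename__ = "index_sources"  # assumed

    id: Mapped[str] = mapped_column(String, primary_key=True)
    domain: Mapped[str] = mapped_column(String, index=True)
    source_type: Mapped[SourceType] = mapped_column(Enum(SourceType))
    source_identifier: Mapped[str] = mapped_column(String)
    config: Mapped[dict | None] = mapped_column(JSON, nullable=True)
    status: Mapped[str] = mapped_column(String, default="indexing")
    last_job_id: Mapped[str | None] = mapped_column(String, nullable=True)
    metrics: Mapped[dict | None] = mapped_column(JSON, nullable=True)
    last_indexed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    last_error: Mapped[str | None] = mapped_column(String, nullable=True)
    last_error_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True))
    updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True))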
@@ -63,29 +72,44 @@ async def index_website(
     Returns a job_id to track the crawling progress.
     """
     try:
+        # Check if IndexSourceDb already exists for this base_url
+        result = await db.execute(
+            select(IndexSourceDb).where(
+                IndexSourceDb.domain == domain,
+                IndexSourceDb.source_type == SourceType.WEBSITE,
+                IndexSourceDb.source_identifier == body.base_url,
+            )
+        )
+        index_source = result.scalar_one_or_none()
+
         # Create a job to track the crawling process
         job_id = await job_manager.create_job(db)

-        # TODO: Implement the actual crawling logic as a background task
-        # This would typically involve:
-        # 1. Using a web crawler (e.g., Firecrawl, Scrapy, Playwright)
-        # 2. Following links up to max_depth
-        # 3. Applying include/exclude patterns
-        # 4. Extracting content from each page
-        # 5. Chunking if needed
-        # 6. Creating WebsiteSourceDb rows
-        # 7. Syncing to Turbopuffer
-        # 8. Updating job status
-
-        # For now, we'll create a placeholder task
-        asyncio.create_task(
-            _crawl_website_job(
-                job_id=job_id,
+        if index_source:
+            # Update existing source
+            index_source.status = "indexing"
+            index_source.last_job_id = job_id
+            index_source.config = body.model_dump()
+            index_source.updated_at = datetime.now(UTC)
+        else:
+            # Create new source
+            source_id = str(uuid.uuid4())
+            index_source = IndexSourceDb(
+                id=source_id,
                 domain=domain,
-                config=body,
-                db=db,
+                source_type=SourceType.WEBSITE,
+                source_identifier=body.base_url,
+                config=body.model_dump(),
+                status="indexing",
+                last_job_id=job_id,
+                created_at=datetime.now(UTC),
+                updated_at=datetime.now(UTC),
             )
-        )
+            db.add(index_source)
+
+        await db.commit()
+
+        asyncio.create_task(job_manager.execute_job(job_id, _crawl_website_job, index_source.id, domain, body, db))

         LOGGER.info(f"Started website crawl job {job_id} for domain: {domain}, base_url: {body.base_url}")
         return JSONResponse(
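
job_manager.execute_job is not shown in this commit. From the call above and the signature of _crawl_website_job below, it appears to take a job id, a coroutine function, and that function's remaining arguments, running the coroutine with job bookkeeping around it. A rough sketch under those assumptions (the real helper in fai.utils.jobs may differ):

# Hypothetical sketch, inferred only from the call site:
#   job_manager.execute_job(job_id, _crawl_website_job, index_source.id, domain, body, db)
from collections.abc import Awaitable, Callable
from typing import Any


async def execute_job(job_id: str, func: Callable[..., Awaitable[Any]], *args: Any) -> None:
    # Presumably marks the job as running, invokes the wrapped coroutine with the
    # job_id prepended to its arguments, and records completion or failure on the job row.
    await func(job_id, *args)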
@@ -99,6 +123,7 @@ async def index_website(

 async def _crawl_website_job(
     job_id: str,
+    source_id: str,
     domain: str,
     config: IndexWebsiteRequest,
     db: AsyncSession,
@@ -115,9 +140,6 @@ async def _crawl_website_job(
     6. Update job status
     """
     try:
-        # Update job status to processing
-        # await job_manager.update_job_status(db, job_id, "PROCESSING")
-
         # Placeholder for actual crawling logic
        # pages = await crawl_website(config.base_url, config.max_depth, ...)

@@ -129,13 +151,40 @@
         # await sync_website_db_to_tpuf(domain, db)
         # await sync_index_to_target(domain, get_website_index_name(), get_query_index_name())

-        # Update job status to completed
-        # await job_manager.update_job_status(db, job_id, "COMPLETED")
+        # Update IndexSourceDb on success
+        result = await db.execute(select(IndexSourceDb).where(IndexSourceDb.id == source_id))
+        index_source = result.scalar_one_or_none()
+
+        if index_source:
+            index_source.status = "active"
+            index_source.last_indexed_at = datetime.now(UTC)
+            index_source.updated_at = datetime.now(UTC)
+            index_source.last_error = None
+            index_source.last_error_at = None
+
+            # Update metrics with crawl results
+            # This should be populated with actual results from the crawling logic
+            index_source.metrics = {
+                "pages_indexed": 0,  # TODO: Update with actual count
+                "pages_failed": 0,  # TODO: Update with actual count
+            }
+
+        await db.commit()

         LOGGER.info(f"Completed website crawl job {job_id} for domain: {domain}")
     except Exception as e:
         LOGGER.exception(f"Failed to complete website crawl job {job_id}")
-        # await job_manager.update_job_status(db, job_id, "FAILED", error=str(e))
+
+        # Update IndexSourceDb on failure
+        result = await db.execute(select(IndexSourceDb).where(IndexSourceDb.id == source_id))
+        index_source = result.scalar_one_or_none()
+
+        if index_source:
+            index_source.status = "failed"
+            index_source.last_error = str(e)
+            index_source.last_error_at = datetime.now(UTC)
+            index_source.updated_at = datetime.now(UTC)
+        await db.commit()


 @fai_app.get(
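
The pages_indexed and pages_failed values written above are still placeholders, since the crawl itself is not implemented in this commit. Once real crawl results exist, the success branch would presumably derive them from the crawl output, along the lines of this hypothetical sketch (the pages list and its ok flag are assumptions, not project code):

# Hypothetical: assumes the crawl step returns per-page results with an `ok` flag.
index_source.metrics = {
    "pages_indexed": sum(1 for page in pages if page.ok),
    "pages_failed": sum(1 for page in pages if not page.ok),
}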
@@ -159,19 +208,33 @@ async def get_website_status(
     if not job:
         return JSONResponse(status_code=404, content={"detail": "Job not found"})

-    # TODO: Get actual page counts from the database
-    # Count pages indexed for this job's base_url
-    pages_indexed = 0
-    pages_failed = 0
-    base_url = ""  # Should be stored with the job
+    # Find the IndexSourceDb that corresponds to this job
+    result = await db.execute(select(IndexSourceDb).where(IndexSourceDb.last_job_id == job_id))
+    index_source = result.scalar_one_or_none()
+
+    if not index_source:
+        return JSONResponse(status_code=404, content={"detail": "Source not found for this job"})
+
+    # Get metrics from IndexSourceDb
+    metrics = index_source.metrics or {}
+    pages_indexed = metrics.get("pages_indexed", 0)
+    pages_failed = metrics.get("pages_failed", 0)
+
+    # Determine status: use job status if in progress, otherwise use source status
+    if job.status.value in ["pending", "in_progress"]:
+        status = job.status.value
+        error = None
+    else:
+        status = index_source.status
+        error = index_source.last_error or job.error

     response = GetWebsiteStatusResponse(
         job_id=job.id,
-        status=job.status.value,
-        base_url=base_url,
+        status=status,
+        base_url=index_source.source_identifier,
         pages_indexed=pages_indexed,
         pages_failed=pages_failed,
-        error=job.error,
+        error=error,
     )

     return JSONResponse(jsonable_encoder(response))
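
With these changes the status payload combines the job row and the IndexSourceDb row: job status while the crawl is pending or in progress, source status and last error afterwards, and page counts from the stored metrics. An illustrative serialized response (values made up, field names from GetWebsiteStatusResponse above):

{
    "job_id": "123e4567-e89b-12d3-a456-426614174000",
    "status": "active",
    "base_url": "https://docs.example.com",
    "pages_indexed": 0,
    "pages_failed": 0,
    "error": null
}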
@@ -300,11 +363,50 @@ async def reindex_website(

     LOGGER.info(f"Deleted {len(websites)} pages from {body.base_url} for domain: {domain}")

+    # Find or create IndexSourceDb for this base_url
+    result = await db.execute(
+        select(IndexSourceDb).where(
+            IndexSourceDb.domain == domain,
+            IndexSourceDb.source_type == SourceType.WEBSITE,
+            IndexSourceDb.source_identifier == body.base_url,
+        )
+    )
+    index_source = result.scalar_one_or_none()
+
     # Create a new crawl job
     job_id = await job_manager.create_job(db)

-    # TODO: Start the crawling job similar to index_website
-    # asyncio.create_task(_crawl_website_job(...))
+    if index_source:
+        # Update existing source
+        index_source.status = "indexing"
+        index_source.last_job_id = job_id
+        index_source.updated_at = datetime.now(UTC)
+        # Reset metrics for reindexing
+        index_source.metrics = {}
+    else:
+        # Create new source if it doesn't exist
+        source_id = str(uuid.uuid4())
+        index_source = IndexSourceDb(
+            id=source_id,
+            domain=domain,
+            source_type=SourceType.WEBSITE,
+            source_identifier=body.base_url,
+            config={"base_url": body.base_url},
+            status="indexing",
+            last_job_id=job_id,
+            created_at=datetime.now(UTC),
+            updated_at=datetime.now(UTC),
+        )
+        db.add(index_source)
+
+    await db.commit()
+
+    # Start the crawling job
+    asyncio.create_task(
+        job_manager.execute_job(
+            job_id, _crawl_website_job, index_source.id, domain, IndexWebsiteRequest(base_url=body.base_url), db
+        )
+    )

     LOGGER.info(f"Started website re-crawl job {job_id} for domain: {domain}, base_url: {body.base_url}")
     return JSONResponse(

servers/fai/tests/utils/website/test_markdown_chunker.py

Lines changed: 1 addition & 1 deletion
@@ -345,7 +345,7 @@ def test_chunk_overlap_parameter_respected(self) -> None:
         for i in range(len(chunks) - 1):
             # Get the last paragraph(s) of current chunk
             current_paragraphs = chunks[i].split("\n\n")
-            next_paragraphs = chunks[i + 1].split("\n\n")
+            chunks[i + 1].split("\n\n")

             # At least one paragraph from current chunk should appear in next chunk
             overlap_found = False
