 from sqlalchemy.ext.asyncio import AsyncSession

 from fai.app import fai_app
+from fai.db import async_session_maker
 from fai.dependencies import (
     ask_ai_enabled,
     get_db,
@@ -66,7 +67,6 @@ async def index_website(
     body: IndexWebsiteRequest = Body(...),
     db: AsyncSession = Depends(get_db),
     _: None = Depends(verify_token),
-    __: None = Depends(ask_ai_enabled),
 ) -> JSONResponse:
     """
     Start crawling and indexing a website.
@@ -86,7 +86,7 @@ async def index_website(

     if index_source:
         index_source.status = "indexing"
-        index_source.last_job_id = job_id
+        index_source.job_id = job_id
         index_source.config = body.model_dump()
         index_source.updated_at = datetime.now(UTC)
     else:
@@ -98,15 +98,17 @@ async def index_website(
             source_identifier=body.base_url,
             config=body.model_dump(),
             status="indexing",
-            last_job_id=job_id,
+            job_id=job_id,
             created_at=datetime.now(UTC),
             updated_at=datetime.now(UTC),
         )
         db.add(index_source)

     await db.commit()

-    asyncio.create_task(job_manager.execute_job(job_id, _crawl_website_job, index_source.id, domain, body, db))
+    asyncio.create_task(
+        job_manager.execute_job(job_id, _crawl_website_job, job_id, index_source.id, domain, body, db)
+    )

     LOGGER.info(f"Started website crawl job {job_id} for domain: {domain}, base_url: {body.base_url}")
     return JSONResponse(jsonable_encoder(IndexWebsiteResponse(job_id=job_id, base_url=body.base_url)))
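Note on the call above: `job_id` is now passed twice, once as `execute_job`'s own tracking key and once forwarded as the new first argument of `_crawl_website_job`, whose completion and failure logs reference the id. A minimal sketch of a forwarding-style manager, assuming the real `JobManager` (not shown in this diff) works roughly like this:

```python
from typing import Any, Callable, Coroutine


# Hypothetical sketch; the real JobManager is defined elsewhere in the codebase.
class JobManager:
    def __init__(self) -> None:
        self.status: dict[str, str] = {}

    async def execute_job(
        self,
        job_id: str,
        func: Callable[..., Coroutine[Any, Any, None]],
        *args: Any,
    ) -> None:
        # job_id keys the manager's bookkeeping; *args are forwarded verbatim
        # to the coroutine, which is why the caller repeats job_id: the job
        # body needs it to log and report progress by id.
        self.status[job_id] = "in_progress"
        try:
            await func(*args)
            self.status[job_id] = "completed"
        except Exception:
            self.status[job_id] = "failed"
```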
@@ -193,38 +195,42 @@ async def _crawl_website_job(
         await sync_website_db_to_tpuf(domain, db)
         await sync_index_to_target(domain, get_website_index_name(), get_query_index_name())

-        result = await db.execute(select(IndexSourceDb).where(IndexSourceDb.id == source_id))
-        index_source = result.scalar_one_or_none()
+        # Get a fresh database session after long-running Turbopuffer sync
+        async with async_session_maker() as fresh_db:
+            result = await fresh_db.execute(select(IndexSourceDb).where(IndexSourceDb.id == source_id))
+            index_source = result.scalar_one_or_none()

-        if index_source:
-            index_source.status = "active"
-            index_source.last_indexed_at = datetime.now(UTC)
-            index_source.updated_at = datetime.now(UTC)
+            if index_source:
+                index_source.status = "active"
+                index_source.last_indexed_at = datetime.now(UTC)
+                index_source.updated_at = datetime.now(UTC)

-            index_source.metrics = {
-                "pages_indexed": pages_indexed,
-                "pages_failed": pages_failed,
-            }
+                index_source.metrics = {
+                    "pages_indexed": pages_indexed,
+                    "pages_failed": pages_failed,
+                }

-        await db.commit()
+            await fresh_db.commit()

         LOGGER.info(f"Completed website crawl job {job_id} for domain: {domain}")
     except Exception:
         LOGGER.exception(f"Failed to complete website crawl job {job_id}")

-        result = await db.execute(select(IndexSourceDb).where(IndexSourceDb.id == source_id))
-        index_source = result.scalar_one_or_none()
+        # Get a fresh database session in case of failure after Turbopuffer sync
+        async with async_session_maker() as fresh_db:
+            result = await fresh_db.execute(select(IndexSourceDb).where(IndexSourceDb.id == source_id))
+            index_source = result.scalar_one_or_none()

-        if index_source:
-            index_source.status = "failed"
-            index_source.updated_at = datetime.now(UTC)
+            if index_source:
+                index_source.status = "failed"
+                index_source.updated_at = datetime.now(UTC)

-            index_source.metrics = {
-                "pages_indexed": pages_indexed,
-                "pages_failed": pages_failed,
-            }
+                index_source.metrics = {
+                    "pages_indexed": pages_indexed,
+                    "pages_failed": pages_failed,
+                }

-        await db.commit()
+            await fresh_db.commit()


 @fai_app.get(
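The fresh-session pattern above exists because the request-scoped session injected via `Depends(get_db)` may be closed, or its connection dropped, by the time the long-running crawl and Turbopuffer sync finish, so status updates are written through a session opened at that moment instead. A plausible definition of the imported `async_session_maker` (an assumption; only the import is visible in this diff), using SQLAlchemy's async API:

```python
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

# Placeholder DSN; the real engine configuration lives in fai.db.
engine = create_async_engine("postgresql+asyncpg://user:pass@localhost/fai")
async_session_maker = async_sessionmaker(engine, expire_on_commit=False)
```

Each `async with async_session_maker() as fresh_db:` then checks a live connection out of the pool, independent of the request-scoped session that started the job.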
@@ -233,11 +239,9 @@ async def _crawl_website_job(
     openapi_extra={"x-fern-audiences": ["customers"], "security": [{"bearerAuth": []}]},
 )
 async def get_website_status(
-    domain: str,
     job_id: str = QueryParam(..., description="The job ID returned from the index endpoint"),
     db: AsyncSession = Depends(get_db),
     _: None = Depends(verify_token),
-    __: None = Depends(ask_ai_enabled),
 ) -> JSONResponse:
     """
     Get the status of a website crawling job.
@@ -249,7 +253,7 @@ async def get_website_status(
         return JSONResponse(status_code=404, content={"detail": "Job not found"})

     # Find the IndexSourceDb that corresponds to this job
-    result = await db.execute(select(IndexSourceDb).where(IndexSourceDb.last_job_id == job_id))
+    result = await db.execute(select(IndexSourceDb).where(IndexSourceDb.job_id == job_id))
     index_source = result.scalar_one_or_none()

     if not index_source:
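The `last_job_id` to `job_id` rename in this query implies a matching column rename on the `IndexSourceDb` model (and presumably a migration), neither of which appears in this diff. An assumed shape of the model, for orientation only:

```python
from datetime import datetime

from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


# Hypothetical sketch; the real IndexSourceDb is defined elsewhere.
class IndexSourceDb(Base):
    __tablename__ = "index_sources"

    id: Mapped[str] = mapped_column(primary_key=True)
    job_id: Mapped[str | None]  # renamed from last_job_id in this change
    status: Mapped[str]
    updated_at: Mapped[datetime]
```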
@@ -261,8 +265,8 @@ async def get_website_status(
     pages_failed = metrics.get("pages_failed", 0)

     # Determine status: use job status if in progress, otherwise use source status
-    if job.status.value in ["pending", "in_progress"]:
-        status = job.status.value
+    if job.status in ["pending", "in_progress"]:
+        status = job.status
         error = None
     else:
         status = index_source.status
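Dropping `.value` above only behaves the same if `job.status` is either a plain string or an enum that mixes in `str`; a pure `Enum` member never compares equal to `"pending"`, and the branch would silently stop matching. A minimal sketch, assuming the str-mixin variant:

```python
from enum import Enum


# Assumption: JobStatus subclasses str, so members compare equal to plain
# strings and membership tests work without reaching for .value.
class JobStatus(str, Enum):
    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"


assert JobStatus.PENDING == "pending"
assert JobStatus.IN_PROGRESS in ["pending", "in_progress"]
```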
@@ -294,7 +298,6 @@ async def get_website_by_id(
     website_id: str,
     db: AsyncSession = Depends(get_db),
     _: None = Depends(verify_token),
-    __: None = Depends(ask_ai_enabled),
 ) -> JSONResponse:
     """
     Get a single indexed website page by ID.
@@ -411,7 +414,7 @@ async def reindex_website(
     if index_source:
         # Update existing source
         index_source.status = "indexing"
-        index_source.last_job_id = job_id
+        index_source.job_id = job_id
         index_source.updated_at = datetime.now(UTC)
         # Reset metrics for reindexing
         index_source.metrics = {}
@@ -425,7 +428,7 @@ async def reindex_website(
             source_identifier=body.base_url,
             config={"base_url": body.base_url},
             status="indexing",
-            last_job_id=job_id,
+            job_id=job_id,
             created_at=datetime.now(UTC),
             updated_at=datetime.now(UTC),
         )
@@ -436,7 +439,13 @@ async def reindex_website(
     # Start the crawling job
     asyncio.create_task(
         job_manager.execute_job(
-            job_id, _crawl_website_job, index_source.id, domain, IndexWebsiteRequest(base_url=body.base_url), db
+            job_id,
+            _crawl_website_job,
+            job_id,
+            index_source.id,
+            domain,
+            IndexWebsiteRequest(base_url=body.base_url),
+            db,
         )
     )
