@@ -73,7 +73,6 @@ async def index_website(
     Returns a job_id to track the crawling progress.
     """
     try:
-        # Check if IndexSourceDb already exists for this base_url
         result = await db.execute(
             select(IndexSourceDb).where(
                 IndexSourceDb.domain == domain,
@@ -83,17 +82,14 @@ async def index_website(
         )
         index_source = result.scalar_one_or_none()

-        # Create a job to track the crawling process
         job_id = await job_manager.create_job(db)

         if index_source:
-            # Update existing source
             index_source.status = "indexing"
             index_source.last_job_id = job_id
             index_source.config = body.model_dump()
             index_source.updated_at = datetime.now(UTC)
         else:
-            # Create new source
             source_id = str(uuid.uuid4())
             index_source = IndexSourceDb(
                 id=source_id,
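
Note on the endpoint hunk above: it follows the usual async SQLAlchemy select-then-update-or-create idiom (`select(...)` plus `scalar_one_or_none()`, then either mutate the loaded row or `db.add()` a new one). A minimal, self-contained sketch of that idiom follows; the trimmed-down `IndexSourceDb` columns, the `base_url` filter, and the session wiring are assumptions for illustration, not the project's actual model.

    # Sketch only: a trimmed-down IndexSourceDb and the upsert idiom used in the hunk above.
    # Column names beyond id/domain/base_url/status/config are assumptions.
    import uuid

    from sqlalchemy import JSON, String, select
    from sqlalchemy.ext.asyncio import AsyncSession
    from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


    class Base(DeclarativeBase):
        pass


    class IndexSourceDb(Base):
        __tablename__ = "index_sources"

        id: Mapped[str] = mapped_column(String, primary_key=True)
        domain: Mapped[str] = mapped_column(String)
        base_url: Mapped[str] = mapped_column(String)
        status: Mapped[str] = mapped_column(String)
        config: Mapped[dict] = mapped_column(JSON)


    async def upsert_index_source(db: AsyncSession, domain: str, base_url: str, config: dict) -> IndexSourceDb:
        # Look up an existing source for this domain + base_url.
        result = await db.execute(
            select(IndexSourceDb).where(
                IndexSourceDb.domain == domain,
                IndexSourceDb.base_url == base_url,
            )
        )
        source = result.scalar_one_or_none()

        if source:
            # Reuse the row: flip it back to "indexing" and store the new crawl config.
            source.status = "indexing"
            source.config = config
        else:
            # First crawl for this site: create the row.
            source = IndexSourceDb(
                id=str(uuid.uuid4()),
                domain=domain,
                base_url=base_url,
                status="indexing",
                config=config,
            )
            db.add(source)

        await db.commit()
        return source
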
@@ -129,43 +125,85 @@ async def _crawl_website_job(
 ) -> None:
     """
     Background job to crawl a website and index its pages.
-
-    TODO: Implement the actual crawling logic:
-    1. Use a web crawler library
-    2. Extract content from pages
-    3. Chunk content if needed
-    4. Create WebsiteSourceDb entries
-    5. Sync to Turbopuffer
-    6. Update job status
     """
+    pages_indexed = 0
+    pages_failed = 0
+
     try:
-        # Placeholder for actual crawling logic
-        # pages = await crawl_website(config.base_url, config.max_depth, ...)
+        from fai.utils.website.crawler import DocumentationCrawler
+
+        crawler = DocumentationCrawler(
+            start_url=config.base_url,
+            domain_filter=config.domain_filter,
+            path_filter=config.path_filter,
+            url_pattern=config.url_pattern,
+            chunk_size=config.chunk_size or 1000,
+            chunk_overlap=config.chunk_overlap or 200,
+            min_content_length=config.min_content_length or 100,
+        )
+
+        loop = asyncio.get_event_loop()
+        chunks = await loop.run_in_executor(
+            None, lambda: crawler.crawl(max_pages=config.max_pages, delay=config.delay or 1.0, verbose=True)
+        )
+
+        LOGGER.info(f"Crawled {len(chunks)} chunks from {config.base_url}")
+
+        for chunk in chunks:
+            try:
+                chunk_id = str(uuid.uuid4())
+
+                metadata = chunk.metadata
+                page_url = metadata.get("url")
+                document_title = metadata.get("document_title")
+
+                keywords_val = metadata.get("keywords")
+                keywords = None
+                if keywords_val and isinstance(keywords_val, str):
+                    keywords = [k.strip() for k in keywords_val.split(",") if k.strip()]
+
+                website_entry = WebsiteDb(
+                    id=chunk_id,
+                    domain=domain,
+                    base_url=config.base_url,
+                    page_url=str(page_url) if page_url else config.base_url,
+                    chunk=chunk.content,
+                    document=chunk.content,
+                    title=str(document_title) if document_title else None,
+                    version=config.version,
+                    product=config.product,
+                    keywords=keywords,
+                    authed=config.authed if config.authed is not None else False,
+                    created_at=datetime.now(UTC),
+                    updated_at=datetime.now(UTC),
+                )

-        # For each page:
-        # - Create WebsiteSourceDb row
-        # - Add to database
+                db.add(website_entry)
+                pages_indexed += 1

-        # await db.commit()
-        # await sync_website_db_to_tpuf(domain, db)
-        # await sync_index_to_target(domain, get_website_index_name(), get_query_index_name())
+            except Exception as e:
+                LOGGER.error(f"Failed to create WebsiteDb entry for chunk: {e}")
+                pages_failed += 1
+
+        pages_failed += len(crawler.failed_urls)
+
+        LOGGER.info(f"Created {pages_indexed} WebsiteDb entries, {pages_failed} failed")
+
+        await db.commit()
+        await sync_website_db_to_tpuf(domain, db)
+        await sync_index_to_target(domain, get_website_index_name(), get_query_index_name())

-        # Update IndexSourceDb on success
         result = await db.execute(select(IndexSourceDb).where(IndexSourceDb.id == source_id))
         index_source = result.scalar_one_or_none()

         if index_source:
             index_source.status = "active"
             index_source.last_indexed_at = datetime.now(UTC)
             index_source.updated_at = datetime.now(UTC)
-            index_source.last_error = None
-            index_source.last_error_at = None

-            # Update metrics with crawl results
-            # This should be populated with actual results from the crawling logic
             index_source.metrics = {
-                "pages_indexed": 0,  # TODO: Update with actual count
-                "pages_failed": 0,  # TODO: Update with actual count
+                "pages_indexed": pages_indexed,
+                "pages_failed": pages_failed,
             }

             await db.commit()
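
The crawl job above offloads the synchronous `DocumentationCrawler.crawl()` call to the default executor, then maps each returned chunk onto a `WebsiteDb` row from `chunk.content` and `chunk.metadata` (`url`, `document_title`, and a comma-separated `keywords` string). The crawler's return type is not part of this diff, so the sketch below only illustrates the chunk shape that loop assumes and the same executor offload pattern; `CrawledChunk` and `FakeCrawler` are hypothetical stand-ins, not project code.

    # Sketch only: the chunk shape the indexing loop above assumes, plus the
    # run_in_executor pattern used to keep the blocking crawl off the event loop.
    import asyncio
    from dataclasses import dataclass, field


    @dataclass
    class CrawledChunk:
        content: str
        # Keys read by the loop above: "url", "document_title", and "keywords"
        # (a comma-separated string).
        metadata: dict[str, str] = field(default_factory=dict)


    class FakeCrawler:
        def __init__(self) -> None:
            self.failed_urls: list[str] = []

        def crawl(self, max_pages: int | None = None, delay: float = 1.0, verbose: bool = False) -> list[CrawledChunk]:
            # A real crawler would fetch and chunk pages here; this returns one canned chunk.
            return [
                CrawledChunk(
                    content="Install the SDK with pip install example-sdk.",
                    metadata={
                        "url": "https://docs.example.com/install",
                        "document_title": "Installation",
                        "keywords": "install, pip, setup",
                    },
                )
            ]


    async def run_crawl() -> list[CrawledChunk]:
        crawler = FakeCrawler()
        loop = asyncio.get_event_loop()
        # The blocking crawl runs in the default thread pool so the event loop stays responsive.
        return await loop.run_in_executor(None, lambda: crawler.crawl(max_pages=10, delay=0.0))


    if __name__ == "__main__":
        for chunk in asyncio.run(run_crawl()):
            keywords = [k.strip() for k in chunk.metadata.get("keywords", "").split(",") if k.strip()]
            print(chunk.metadata.get("url"), chunk.metadata.get("document_title"), keywords)
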
@@ -174,15 +212,18 @@ async def _crawl_website_job(
     except Exception:
         LOGGER.exception(f"Failed to complete website crawl job {job_id}")

-        # Update IndexSourceDb on failure
         result = await db.execute(select(IndexSourceDb).where(IndexSourceDb.id == source_id))
         index_source = result.scalar_one_or_none()

         if index_source:
             index_source.status = "failed"
-            index_source.last_error = str(e)
-            index_source.last_error_at = datetime.now(UTC)
             index_source.updated_at = datetime.now(UTC)
+
+            index_source.metrics = {
+                "pages_indexed": pages_indexed,
+                "pages_failed": pages_failed,
+            }
+
             await db.commit()

@@ -225,7 +266,7 @@ async def get_website_status(
         error = None
     else:
         status = index_source.status
-        error = index_source.last_error or job.error
+        error = job.error

     response = GetWebsiteStatusResponse(
         job_id=job.id,
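
The status hunk above now surfaces `job.error` directly, with the source's lifecycle carried by `status` ("indexing", "active", "failed") and, per the job code, a `metrics` dict of `pages_indexed`/`pages_failed`. Below is a hedged polling sketch from a caller's point of view; `fetch_status` is a hypothetical coroutine, since the route path and the full `GetWebsiteStatusResponse` schema are not shown in this diff.

    # Sketch only: how a caller might poll the status endpoint until the crawl settles.
    # fetch_status() is hypothetical; only job_id, status, and error appear in the diff.
    import asyncio
    from typing import Any, Awaitable, Callable


    async def wait_for_crawl(
        fetch_status: Callable[[], Awaitable[dict[str, Any]]],
        poll_seconds: float = 5.0,
    ) -> dict[str, Any]:
        """Poll until the source leaves the "indexing" state, then return the last payload."""
        while True:
            payload = await fetch_status()
            status = payload.get("status")
            if status == "active":
                return payload
            if status == "failed":
                # The endpoint above reports the job's error message on failure.
                raise RuntimeError(payload.get("error") or "website crawl failed")
            await asyncio.sleep(poll_seconds)
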