
Commit 2dae729

Fixes
1 parent f3f1489 commit 2dae729

2 files changed (+84, -36 lines)

servers/fai/src/fai/models/api/website_api.py

Lines changed: 12 additions & 5 deletions
@@ -10,14 +10,21 @@
 class IndexWebsiteRequest(BaseModel):
     base_url: str = Field(description="The base URL to start indexing from (e.g., 'https://docs.example.com')")
     domain_filter: str | None = Field(
-        default=None, description="Domain to filter the index (e.g., 'docs.example.com')"
+        default=None, description="Domain to filter crawling (e.g., 'docs.example.com'). Defaults to base_url domain."
     )
-    path_include_patterns: list[str] | None = Field(
-        default=None, description="URL patterns to include (e.g., ['/docs/*', '/api/*']). If empty, includes all."
+    path_filter: str | None = Field(
+        default=None,
+        description="Path prefix to restrict crawling (e.g., '/docs'). Only URLs starting with this will be crawled.",
     )
-    path_exclude_patterns: list[str] | None = Field(
-        default=None, description="URL patterns to exclude (e.g., ['/blog/*', '*.pdf'])"
+    url_pattern: str | None = Field(
+        default=None,
+        description="Regex pattern to filter URLs (e.g., 'https://example\\.com/(docs|api)/.*').",
     )
+    chunk_size: int | None = Field(default=1000, description="Size of text chunks for splitting documents")
+    chunk_overlap: int | None = Field(default=200, description="Overlap between consecutive chunks")
+    min_content_length: int | None = Field(default=100, description="Minimum content length to index a page")
+    max_pages: int | None = Field(default=None, description="Maximum number of pages to crawl. None means unlimited.")
+    delay: float | None = Field(default=1.0, description="Delay in seconds between requests")
     version: str | None = Field(default=None, description="Version to tag all indexed pages with")
     product: str | None = Field(default=None, description="Product to tag all indexed pages with")
     authed: bool | None = Field(default=None, description="Whether indexed pages should be auth-gated")
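
For reference, a sketch of how a caller might populate the reworked request model: path_filter and url_pattern take over from the old path_include_patterns / path_exclude_patterns lists, and the new chunking and crawl knobs carry the defaults shown in the diff. The field names come straight from the model above and the import path mirrors the file location, but the route that consumes this payload is not shown in this commit, so treat the model_dump() step as an assumption.

# Hypothetical usage sketch of IndexWebsiteRequest (field names/defaults taken from the diff above).
from fai.models.api.website_api import IndexWebsiteRequest

request = IndexWebsiteRequest(
    base_url="https://docs.example.com",
    domain_filter="docs.example.com",  # optional; defaults to the base_url's domain
    path_filter="/docs",  # only URLs starting with this prefix are crawled
    url_pattern=r"https://docs\.example\.com/(docs|api)/.*",  # optional regex filter
    chunk_size=1000,
    chunk_overlap=200,
    max_pages=500,
    delay=1.0,
    version="v2",
    product="docs",
)

payload = request.model_dump()  # e.g. serialized into the JSON body of the indexing request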

servers/fai/src/fai/routes/website.py

Lines changed: 72 additions & 31 deletions
@@ -73,7 +73,6 @@ async def index_website(
     Returns a job_id to track the crawling progress.
     """
     try:
-        # Check if IndexSourceDb already exists for this base_url
         result = await db.execute(
             select(IndexSourceDb).where(
                 IndexSourceDb.domain == domain,
@@ -83,17 +82,14 @@ async def index_website(
         )
         index_source = result.scalar_one_or_none()
 
-        # Create a job to track the crawling process
         job_id = await job_manager.create_job(db)
 
         if index_source:
-            # Update existing source
             index_source.status = "indexing"
             index_source.last_job_id = job_id
             index_source.config = body.model_dump()
             index_source.updated_at = datetime.now(UTC)
         else:
-            # Create new source
             source_id = str(uuid.uuid4())
             index_source = IndexSourceDb(
                 id=source_id,
@@ -129,43 +125,85 @@ async def _crawl_website_job(
 ) -> None:
     """
     Background job to crawl a website and index its pages.
-
-    TODO: Implement the actual crawling logic:
-    1. Use a web crawler library
-    2. Extract content from pages
-    3. Chunk content if needed
-    4. Create WebsiteSourceDb entries
-    5. Sync to Turbopuffer
-    6. Update job status
     """
+    pages_indexed = 0
+    pages_failed = 0
+
     try:
-        # Placeholder for actual crawling logic
-        # pages = await crawl_website(config.base_url, config.max_depth, ...)
+        from fai.utils.website.crawler import DocumentationCrawler
+
+        crawler = DocumentationCrawler(
+            start_url=config.base_url,
+            domain_filter=config.domain_filter,
+            path_filter=config.path_filter,
+            url_pattern=config.url_pattern,
+            chunk_size=config.chunk_size or 1000,
+            chunk_overlap=config.chunk_overlap or 200,
+            min_content_length=config.min_content_length or 100,
+        )
+
+        loop = asyncio.get_event_loop()
+        chunks = await loop.run_in_executor(
+            None, lambda: crawler.crawl(max_pages=config.max_pages, delay=config.delay or 1.0, verbose=True)
+        )
+
+        LOGGER.info(f"Crawled {len(chunks)} chunks from {config.base_url}")
+
+        for chunk in chunks:
+            try:
+                chunk_id = str(uuid.uuid4())
+
+                metadata = chunk.metadata
+                page_url = metadata.get("url")
+                document_title = metadata.get("document_title")
+
+                keywords_val = metadata.get("keywords")
+                keywords = None
+                if keywords_val and isinstance(keywords_val, str):
+                    keywords = [k.strip() for k in keywords_val.split(",") if k.strip()]
+
+                website_entry = WebsiteDb(
+                    id=chunk_id,
+                    domain=domain,
+                    base_url=config.base_url,
+                    page_url=str(page_url) if page_url else config.base_url,
+                    chunk=chunk.content,
+                    document=chunk.content,
+                    title=str(document_title) if document_title else None,
+                    version=config.version,
+                    product=config.product,
+                    keywords=keywords,
+                    authed=config.authed if config.authed is not None else False,
+                    created_at=datetime.now(UTC),
+                    updated_at=datetime.now(UTC),
+                )
 
-        # For each page:
-        # - Create WebsiteSourceDb row
-        # - Add to database
+                db.add(website_entry)
+                pages_indexed += 1
 
-        # await db.commit()
-        # await sync_website_db_to_tpuf(domain, db)
-        # await sync_index_to_target(domain, get_website_index_name(), get_query_index_name())
+            except Exception as e:
+                LOGGER.error(f"Failed to create WebsiteDb entry for chunk: {e}")
+                pages_failed += 1
+
+        pages_failed += len(crawler.failed_urls)
+
+        LOGGER.info(f"Created {pages_indexed} WebsiteDb entries, {pages_failed} failed")
+
+        await db.commit()
+        await sync_website_db_to_tpuf(domain, db)
+        await sync_index_to_target(domain, get_website_index_name(), get_query_index_name())
 
-        # Update IndexSourceDb on success
         result = await db.execute(select(IndexSourceDb).where(IndexSourceDb.id == source_id))
        index_source = result.scalar_one_or_none()
 
         if index_source:
             index_source.status = "active"
             index_source.last_indexed_at = datetime.now(UTC)
             index_source.updated_at = datetime.now(UTC)
-            index_source.last_error = None
-            index_source.last_error_at = None
 
-            # Update metrics with crawl results
-            # This should be populated with actual results from the crawling logic
             index_source.metrics = {
-                "pages_indexed": 0,  # TODO: Update with actual count
-                "pages_failed": 0,  # TODO: Update with actual count
+                "pages_indexed": pages_indexed,
+                "pages_failed": pages_failed,
             }
 
         await db.commit()
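
The DocumentationCrawler imported in the job above lives in fai.utils.website.crawler, which this commit does not touch, so its definition is not visible here. Purely as an inferred sketch of the interface the job appears to rely on (constructor arguments, a synchronous crawl() returning chunks with content and metadata, and a failed_urls attribute) -- the names and signatures below are reconstructed from the call site and are not confirmed:

# Inferred (hypothetical) interface for the crawler used by _crawl_website_job.
# The real class is defined elsewhere in the repo and may differ.
from dataclasses import dataclass, field


@dataclass
class CrawledChunk:
    # The job reads chunk.content plus the metadata keys "url", "document_title", "keywords".
    content: str
    metadata: dict[str, str] = field(default_factory=dict)


class DocumentationCrawler:
    def __init__(
        self,
        start_url: str,
        domain_filter: str | None = None,
        path_filter: str | None = None,
        url_pattern: str | None = None,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        min_content_length: int = 100,
    ) -> None:
        self.start_url = start_url
        # URLs that could not be fetched or parsed; the job adds len(failed_urls) to pages_failed.
        self.failed_urls: list[str] = []

    def crawl(
        self, max_pages: int | None = None, delay: float = 1.0, verbose: bool = False
    ) -> list[CrawledChunk]:
        # Synchronous by design; the route wraps this call in loop.run_in_executor.
        raise NotImplementedError
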
@@ -174,15 +212,18 @@ async def _crawl_website_job(
     except Exception:
         LOGGER.exception(f"Failed to complete website crawl job {job_id}")
 
-        # Update IndexSourceDb on failure
         result = await db.execute(select(IndexSourceDb).where(IndexSourceDb.id == source_id))
         index_source = result.scalar_one_or_none()
 
         if index_source:
             index_source.status = "failed"
-            index_source.last_error = str(e)
-            index_source.last_error_at = datetime.now(UTC)
             index_source.updated_at = datetime.now(UTC)
+
+            index_source.metrics = {
+                "pages_indexed": pages_indexed,
+                "pages_failed": pages_failed,
+            }
+
         await db.commit()
@@ -225,7 +266,7 @@ async def get_website_status(
         error = None
     else:
         status = index_source.status
-        error = index_source.last_error or job.error
+        error = job.error
 
     response = GetWebsiteStatusResponse(
         job_id=job.id,
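
One design choice worth noting from the diff: crawler.crawl() is a blocking, synchronous call, so _crawl_website_job hands it to the default thread pool via loop.run_in_executor instead of calling it inline, which keeps the event loop free while pages are fetched. A minimal, self-contained sketch of that pattern, using a stand-in blocking function rather than the real crawler:

# Sketch of the offloading pattern used in _crawl_website_job (stand-in crawl function).
import asyncio
import time


def blocking_crawl(delay: float) -> list[str]:
    # Simulates a synchronous crawler doing network I/O.
    time.sleep(delay)
    return ["chunk-1", "chunk-2"]


async def main() -> None:
    loop = asyncio.get_event_loop()  # the job uses get_event_loop(); get_running_loop() also works here
    # Run the blocking call in the default ThreadPoolExecutor so the event loop stays responsive.
    chunks = await loop.run_in_executor(None, lambda: blocking_crawl(delay=0.5))
    print(f"Crawled {len(chunks)} chunks")


if __name__ == "__main__":
    asyncio.run(main())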
