Commit 19b72aa

Tanvir Stsbhangu authored and committed

Add website source routes

1 parent aec8df4 · commit 19b72aa

File tree

9 files changed: +653 -1 lines changed

.vscode/settings.json

Lines changed: 2 additions & 1 deletion

@@ -37,5 +37,6 @@
   },
   "[typescriptreact]": {
     "editor.defaultFormatter": "biomejs.biome"
-  }
+  },
+  "typescript.tsserver.maxTsServerMemory": 4096
 }
Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
"""create websites table

Revision ID: create_websites
Revises: 461b2caaffc7
Create Date: 2025-10-27 00:00:00.000000

"""

from typing import (
    Sequence,
    Union,
)

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision: str = "create_websites"
down_revision: Union[str, Sequence[str], None] = "461b2caaffc7"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Upgrade schema."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "websites",
        sa.Column("id", sa.String(), nullable=False),
        sa.Column("domain", sa.String(), nullable=False),
        sa.Column("base_url", sa.String(), nullable=False),
        sa.Column("page_url", sa.String(), nullable=False),
        sa.Column("chunk", sa.String(), nullable=False),
        sa.Column("document", sa.String(), nullable=False),
        sa.Column("title", sa.String(), nullable=True),
        sa.Column("version", sa.String(), nullable=True),
        sa.Column("product", sa.String(), nullable=True),
        sa.Column("keywords", sa.ARRAY(sa.String()), nullable=True),
        sa.Column("authed", sa.Boolean(), nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_index("idx_websites_domain", "websites", ["domain"], unique=False)
    op.create_index("idx_websites_base_url", "websites", ["base_url"], unique=False)
    op.create_index(
        "idx_websites_domain_base_url", "websites", ["domain", "base_url"], unique=False
    )
    # ### end Alembic commands ###


def downgrade() -> None:
    """Downgrade schema."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_index("idx_websites_domain_base_url", table_name="websites")
    op.drop_index("idx_websites_base_url", table_name="websites")
    op.drop_index("idx_websites_domain", table_name="websites")
    op.drop_table("websites")
    # ### end Alembic commands ###
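A minimal sketch of applying this revision programmatically via Alembic's Python API, in case you prefer it over the CLI. The "alembic.ini" path is an assumption about this repo's layout and does not appear in the diff; `alembic upgrade head` from the command line is the more common route.

# Sketch only: applies the revision added above.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")  # hypothetical config path, not shown in this diff
command.upgrade(cfg, "create_websites")  # or "head" to apply all pending revisions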
Lines changed: 69 additions & 0 deletions

@@ -0,0 +1,69 @@
from pydantic import (
    BaseModel,
    Field,
)

from fai.models.api.commons.pagination import PaginationResponse
from fai.models.types.website_types import Website


class IndexWebsiteRequest(BaseModel):
    base_url: str = Field(description="The base URL to start crawling from (e.g., 'https://docs.example.com')")
    max_depth: int | None = Field(
        default=1, description="Maximum depth to crawl from base URL (1 = only pages linked from base URL)"
    )
    include_patterns: list[str] | None = Field(
        default=None, description="URL patterns to include (e.g., ['/docs/*', '/api/*']). If empty, includes all."
    )
    exclude_patterns: list[str] | None = Field(
        default=None, description="URL patterns to exclude (e.g., ['/blog/*', '*.pdf'])"
    )
    version: str | None = Field(default=None, description="Version to tag all crawled pages with")
    product: str | None = Field(default=None, description="Product to tag all crawled pages with")
    authed: bool | None = Field(default=None, description="Whether crawled pages should be auth-gated")


class IndexWebsiteResponse(BaseModel):
    job_id: str = Field(description="ID to track the crawling job status")
    base_url: str = Field(description="The base URL being crawled")


class GetWebsiteStatusResponse(BaseModel):
    job_id: str
    status: str = Field(description="Job status: PENDING, PROCESSING, COMPLETED, or FAILED")
    base_url: str
    pages_indexed: int = Field(description="Number of pages successfully indexed")
    pages_failed: int = Field(description="Number of pages that failed to index")
    error: str | None = Field(default=None, description="Error message if the job failed")


class GetWebsiteResponse(BaseModel):
    website: Website = Field(description="The requested website")


class GetWebsitesResponse(BaseModel):
    websites: list[Website] = Field(description="List of indexed website pages for the domain")
    pagination: PaginationResponse = Field(description="Pagination information for the website list")


class ReindexWebsiteRequest(BaseModel):
    base_url: str = Field(description="The base URL to re-crawl (will delete old pages and re-index)")


class ReindexWebsiteResponse(BaseModel):
    job_id: str = Field(description="ID to track the re-crawling job status")
    base_url: str = Field(description="The base URL being re-crawled")


class DeleteWebsiteRequest(BaseModel):
    base_url: str = Field(description="The base URL of the website to delete (deletes all pages from this source)")


class DeleteWebsiteResponse(BaseModel):
    success: bool = Field(description="Whether the website was successfully deleted")
    pages_deleted: int = Field(description="Number of pages deleted")


class DeleteAllWebsitesResponse(BaseModel):
    success: bool = Field(description="Whether all websites were successfully deleted")
    pages_deleted: int = Field(description="Total number of pages deleted")
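As a rough illustration of how a caller might build the crawl request defined above. The import path below is hypothetical (the new file's location is not visible in this view), and Pydantic v2 is assumed for model_dump.

# Illustrative only: hypothetical module path; Pydantic v2 assumed.
from fai.models.api.website import IndexWebsiteRequest  # hypothetical path

request = IndexWebsiteRequest(
    base_url="https://docs.example.com",
    max_depth=2,
    include_patterns=["/docs/*"],
    exclude_patterns=["*.pdf"],
    version="v2",
)
payload = request.model_dump(exclude_none=True)  # dict suitable for a JSON request body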
Lines changed: 69 additions & 0 deletions

@@ -0,0 +1,69 @@
from openai import AsyncOpenAI
from sqlalchemy import (
    Boolean,
    Column,
    DateTime,
    Integer,
    String,
)

from fai.db import Base
from fai.models.db.utils.array_column import ArrayColumn
from fai.models.types.website_types import Website
from fai.models.utils.record import TurbopufferRecord
from fai.settings import CONFIG


class WebsiteDb(Base):
    __tablename__ = "websites"
    __table_args__ = {"extend_existing": True}

    id = Column(String, primary_key=True)
    domain = Column(String, nullable=False)
    base_url = Column(String, nullable=False)
    page_url = Column(String, nullable=False)
    chunk = Column(String, nullable=False)
    document = Column(String, nullable=False)
    title = Column(String, nullable=True)
    version = Column(String, nullable=True)
    product = Column(String, nullable=True)
    keywords = Column(ArrayColumn(String), nullable=True)
    authed = Column(Boolean, nullable=True)
    created_at = Column(DateTime(timezone=True), nullable=False)
    updated_at = Column(DateTime(timezone=True), nullable=False)

    def to_api(self) -> Website:
        return Website(
            website_id=self.id,
            domain=self.domain,
            base_url=self.base_url,
            page_url=self.page_url,
            chunk=self.chunk,
            document=self.document,
            title=self.title,
            version=self.version,
            product=self.product,
            keywords=self.keywords,
            authed=self.authed,
            created_at=self.created_at,
            updated_at=self.updated_at,
        )

    async def to_tpuf_record(self, openai_client: AsyncOpenAI) -> TurbopufferRecord:
        embedding = await openai_client.embeddings.create(
            input=self.chunk,
            model=CONFIG.DEFAULT_EMBEDDING_MODEL.model_name,
        )
        chunk_vector = embedding.data[0].embedding
        return TurbopufferRecord(
            id=self.id,
            vector=chunk_vector,
            chunk=self.chunk,
            document=self.document,
            title=self.title or "",
            url=self.page_url or "",
            version=self.version,
            product=self.product,
            keywords=self.keywords,
            authed=self.authed,
        )
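A sketch of how a stored row might be turned into a Turbopuffer record using the model above. It assumes OPENAI_API_KEY is set in the environment and that a WebsiteDb row has already been loaded; session handling and the actual upsert are outside this diff.

# Sketch only: embeds one row's chunk and builds the upsert record.
# Assumes this code lives where WebsiteDb and TurbopufferRecord are importable.
from openai import AsyncOpenAI


async def embed_row(row: WebsiteDb) -> TurbopufferRecord:
    client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment
    return await row.to_tpuf_record(client)  # one embeddings call per chunk

# Usage (given a loaded row): record = asyncio.run(embed_row(row))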

servers/fai/src/fai/models/enums/index_names.py

Lines changed: 1 addition & 0 deletions

@@ -8,3 +8,4 @@ class DataIndexNames(Enum):
     DOCUMENT = "document"
     GUIDANCE = "guidance"
     SLACK_CONTEXT = "slack_context"
+    WEBSITE = "website"
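A trivial check of the new enum member; how this name maps to an index or namespace elsewhere in the codebase is not shown in this diff. The module path follows the file path above under a src layout.

from fai.models.enums.index_names import DataIndexNames

assert DataIndexNames.WEBSITE.value == "website"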
Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
from datetime import datetime

from pydantic import BaseModel


class Website(BaseModel):
    website_id: str
    domain: str
    base_url: str
    page_url: str
    chunk: str
    document: str
    title: str | None = None
    version: str | None = None
    product: str | None = None
    keywords: list[str] | None = None
    authed: bool | None = None
    created_at: datetime
    updated_at: datetime
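For reference, a minimal instantiation of the shared Website type; the field values are invented and Pydantic v2 is assumed for model_dump_json.

# Illustration only: invented values, just to show the payload shape.
from datetime import datetime, timezone

page = Website(
    website_id="web_123",
    domain="docs.example.com",
    base_url="https://docs.example.com",
    page_url="https://docs.example.com/getting-started",
    chunk="Getting started with ...",
    document="Full page text ...",
    keywords=["setup", "install"],
    created_at=datetime.now(timezone.utc),
    updated_at=datetime.now(timezone.utc),
)
print(page.model_dump_json(exclude_none=True))  # Pydantic v2 assumed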
