Skip to content

Commit 6d208ac

Browse files
tsbhanguclaude
and committed
Add website crawler infrastructure
- Add website crawling utilities (crawler, content extractor, chunker) - Add comprehensive test coverage for all crawling functionality - Add Python dependencies for web scraping (beautifulsoup4, lxml, html2text) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <[email protected]>
1 parent f4db65e commit 6d208ac

File tree

12 files changed

+1870
-26
lines changed

12 files changed

+1870
-26
lines changed

servers/fai/poetry.lock

Lines changed: 79 additions & 25 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

servers/fai/pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ priority = "supplemental"
1616

1717
[tool.poetry.dependencies]
1818
python = ">=3.11,<4.0"
19-
fastapi = "^0.120.1"
19+
fastapi = "^0.116.2"
2020
pydantic = "^2.8.0"
2121
requests = "^2.31.0"
2222
types-requests = "^2.28.11"
@@ -45,6 +45,9 @@ slack-sdk = "^3.36.0"
4545
python-multipart = "^0.0.20"
4646
upstash-redis = "^1.4.0"
4747
aioboto3 = "^13.0.0"
48+
markdownify = "^1.2.0"
49+
tenacity = "^8.2.0"
50+
beautifulsoup4 = "^4.12.0"
4851

4952
[tool.poetry.scripts]
5053
start = "fai.main:start"
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from fai.utils.website.chunker import MarkdownChunker
2+
from fai.utils.website.crawler import DocumentationCrawler
3+
from fai.utils.website.extractor import ContentExtractor
4+
from fai.utils.website.models import DocumentChunk
5+
6+
__all__ = ["DocumentChunk", "ContentExtractor", "MarkdownChunker", "DocumentationCrawler"]
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
import re
2+
3+
from fai.utils.website.models import DocumentChunk
4+
5+
6+
class MarkdownChunker:
7+
def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200, min_chunk_size: int = 100):
8+
self.chunk_size = chunk_size
9+
self.chunk_overlap = chunk_overlap
10+
self.min_chunk_size = min_chunk_size
11+
12+
def chunk_document(
13+
self, markdown_content: str, title: str, metadata: dict[str, str | list[str] | None]
14+
) -> list[DocumentChunk]:
15+
chunks: list[DocumentChunk] = []
16+
sections = self._split_by_headers(markdown_content)
17+
18+
for section in sections:
19+
section_chunks = self._chunk_section(section, title, metadata, markdown_content)
20+
chunks.extend(section_chunks)
21+
22+
return chunks
23+
24+
def _split_by_headers(self, markdown: str) -> list[dict[str, str | int | None]]:
25+
sections: list[dict[str, str | int | None]] = []
26+
lines = markdown.split("\n")
27+
28+
current_lines: list[str] = []
29+
current_heading: str | None = None
30+
current_level: int = 0
31+
32+
for line in lines:
33+
header_match = re.match(r"^(#{1,6})\s+(.+)$", line)
34+
35+
if header_match:
36+
if current_lines:
37+
sections.append(
38+
{"heading": current_heading, "level": current_level, "content": "\n".join(current_lines)}
39+
)
40+
41+
current_level = len(header_match.group(1))
42+
current_heading = header_match.group(2).strip()
43+
current_lines = []
44+
else:
45+
current_lines.append(line)
46+
47+
if current_lines:
48+
sections.append({"heading": current_heading, "level": current_level, "content": "\n".join(current_lines)})
49+
50+
if not sections and markdown.strip():
51+
sections.append({"heading": None, "level": 0, "content": markdown})
52+
53+
return sections
54+
55+
def _chunk_section(
56+
self,
57+
section: dict[str, str | int | None],
58+
doc_title: str,
59+
base_metadata: dict[str, str | list[str] | None],
60+
full_document: str,
61+
) -> list[DocumentChunk]:
62+
chunks: list[DocumentChunk] = []
63+
heading_val = section["heading"]
64+
level_val = section["level"]
65+
content_val = section["content"]
66+
67+
heading: str | None = heading_val if isinstance(heading_val, str) or heading_val is None else None
68+
level: int = level_val if isinstance(level_val, int) else 0
69+
content: str = content_val.strip() if isinstance(content_val, str) else ""
70+
71+
if not content or len(content) < self.min_chunk_size:
72+
return chunks
73+
74+
if len(content) <= self.chunk_size:
75+
chunk_content = content
76+
77+
if heading:
78+
chunk_content = f"# {heading}\n\n{chunk_content}"
79+
80+
chunks.append(
81+
DocumentChunk(
82+
content=chunk_content,
83+
metadata={
84+
"document_title": doc_title,
85+
"section_heading": heading,
86+
"heading_level": level,
87+
"chunk_type": "section",
88+
**base_metadata,
89+
},
90+
full_document=full_document,
91+
)
92+
)
93+
else:
94+
text_chunks = self._split_with_overlap(content)
95+
96+
for i, chunk_text in enumerate(text_chunks):
97+
if len(chunk_text.strip()) >= self.min_chunk_size:
98+
if heading and i == 0:
99+
chunk_content = f"# {heading}\n\n{chunk_text}"
100+
elif heading:
101+
chunk_content = f"[Continuing from: {heading}]\n\n{chunk_text}"
102+
else:
103+
chunk_content = chunk_text
104+
105+
chunks.append(
106+
DocumentChunk(
107+
content=chunk_content,
108+
metadata={
109+
"document_title": doc_title,
110+
"section_heading": heading,
111+
"heading_level": level,
112+
"chunk_type": "section_part",
113+
"part_number": i + 1,
114+
"total_parts": len(text_chunks),
115+
**base_metadata,
116+
},
117+
full_document=full_document,
118+
)
119+
)
120+
121+
return chunks
122+
123+
def _split_with_overlap(self, text: str) -> list[str]:
124+
if len(text) <= self.chunk_size:
125+
return [text]
126+
127+
chunks: list[str] = []
128+
paragraphs = re.split(r"\n\n+", text)
129+
current_chunk: list[str] = []
130+
current_length = 0
131+
132+
for para in paragraphs:
133+
para_length = len(para)
134+
135+
if current_length + para_length > self.chunk_size and current_chunk:
136+
chunks.append("\n\n".join(current_chunk))
137+
138+
overlap_paras: list[str] = []
139+
overlap_length = 0
140+
141+
for p in reversed(current_chunk):
142+
if overlap_length + len(p) <= self.chunk_overlap:
143+
overlap_paras.insert(0, p)
144+
overlap_length += len(p)
145+
else:
146+
break
147+
148+
current_chunk = overlap_paras
149+
current_length = overlap_length
150+
151+
current_chunk.append(para)
152+
current_length += para_length
153+
154+
if current_chunk:
155+
chunks.append("\n\n".join(current_chunk))
156+
157+
return chunks

0 commit comments

Comments
 (0)