Skip to content

Commit a59b8fa

Browse files
tsbhangu and vercel[bot] committed
Tanvir/add website crawler (#4530)
Co-authored-by: vercel[bot] <35613825+vercel[bot]@users.noreply.github.com>
1 parent f4db65e commit a59b8fa

File tree

12 files changed

+1861
-25
lines changed

12 files changed

+1861
-25
lines changed

servers/fai/poetry.lock

Lines changed: 79 additions & 25 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

servers/fai/pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ slack-sdk = "^3.36.0"
4545
python-multipart = "^0.0.20"
4646
upstash-redis = "^1.4.0"
4747
aioboto3 = "^13.0.0"
48+
markdownify = "^1.2.0"
49+
tenacity = "^8.2.0"
50+
beautifulsoup4 = "^4.12.0"
4851

4952
[tool.poetry.scripts]
5053
start = "fai.main:start"
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
"""Public interface for the website crawling utilities.

Re-exports the crawler pipeline pieces so callers can import them
directly from ``fai.utils.website``.
"""

from fai.utils.website.chunker import MarkdownChunker
from fai.utils.website.crawler import DocumentationCrawler
from fai.utils.website.extractor import ContentExtractor
from fai.utils.website.models import DocumentChunk

__all__ = ["DocumentChunk", "ContentExtractor", "MarkdownChunker", "DocumentationCrawler"]
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
import re
2+
3+
from fai.utils.website.models import DocumentChunk
4+
5+
6+
class MarkdownChunker:
    """Split markdown documents into chunks suitable for indexing.

    A document is first partitioned at ATX headers (``#`` .. ``######``).
    Each resulting section becomes a single chunk when it fits within
    ``chunk_size`` characters, and is otherwise split on paragraph
    boundaries into overlapping parts.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200, min_chunk_size: int = 100):
        # Target maximum chunk length, in characters (paragraphs are never
        # split mid-paragraph, so a single huge paragraph can still exceed it).
        self.chunk_size = chunk_size
        # Approximate number of trailing characters carried into the next part.
        self.chunk_overlap = chunk_overlap
        # Sections or parts shorter than this are discarded.
        self.min_chunk_size = min_chunk_size

    def chunk_document(
        self, markdown_content: str, title: str, metadata: dict[str, str | list[str] | None]
    ) -> list[DocumentChunk]:
        """Return all chunks for one markdown document, in document order."""
        return [
            chunk
            for section in self._split_by_headers(markdown_content)
            for chunk in self._chunk_section(section, title, metadata)
        ]

    def _split_by_headers(self, markdown: str) -> list[dict[str, str | int | None]]:
        """Partition markdown into ``{heading, level, content}`` sections at ATX headers.

        The header line itself is not included in ``content``; a document with
        no headers yields a single level-0 section. Headers with no body lines
        before the next header produce no section.
        """
        sections: list[dict[str, str | int | None]] = []
        buffer: list[str] = []
        heading: str | None = None
        level = 0

        def flush() -> None:
            # Emit the accumulated body (if any) under the current heading.
            if buffer:
                sections.append({"heading": heading, "level": level, "content": "\n".join(buffer)})

        for raw_line in markdown.split("\n"):
            match = re.match(r"^(#{1,6})\s+(.+)$", raw_line)
            if match is None:
                buffer.append(raw_line)
                continue
            flush()
            level = len(match.group(1))
            heading = match.group(2).strip()
            buffer = []

        flush()

        # Header-less but non-blank documents become one unnamed section.
        if not sections and markdown.strip():
            sections.append({"heading": None, "level": 0, "content": markdown})

        return sections

    def _chunk_section(
        self, section: dict[str, str | int | None], doc_title: str, base_metadata: dict[str, str | list[str] | None]
    ) -> list[DocumentChunk]:
        """Convert one header section into zero or more ``DocumentChunk`` objects."""
        raw_heading = section["heading"]
        raw_level = section["level"]
        raw_content = section["content"]

        # Defensive narrowing: the section dict is loosely typed.
        heading = raw_heading if isinstance(raw_heading, str) else None
        level = raw_level if isinstance(raw_level, int) else 0
        content = raw_content.strip() if isinstance(raw_content, str) else ""

        # Drop empty or too-small sections outright.
        if not content or len(content) < self.min_chunk_size:
            return []

        if len(content) <= self.chunk_size:
            # Whole section fits in one chunk; re-attach the heading as context.
            body = f"# {heading}\n\n{content}" if heading else content
            # NOTE: base_metadata is unpacked last, so it may override fixed keys.
            return [
                DocumentChunk(
                    content=body,
                    metadata={
                        "document_title": doc_title,
                        "section_heading": heading,
                        "heading_level": level,
                        "chunk_type": "section",
                        **base_metadata,
                    },
                )
            ]

        results: list[DocumentChunk] = []
        parts = self._split_with_overlap(content)
        for index, part in enumerate(parts):
            # Skip fragments that are too small to be useful on their own.
            if len(part.strip()) < self.min_chunk_size:
                continue
            if heading and index == 0:
                body = f"# {heading}\n\n{part}"
            elif heading:
                # Later parts get a continuation marker instead of the header.
                body = f"[Continuing from: {heading}]\n\n{part}"
            else:
                body = part
            results.append(
                DocumentChunk(
                    content=body,
                    metadata={
                        "document_title": doc_title,
                        "section_heading": heading,
                        "heading_level": level,
                        "chunk_type": "section_part",
                        "part_number": index + 1,
                        "total_parts": len(parts),
                        **base_metadata,
                    },
                )
            )
        return results

    def _split_with_overlap(self, text: str) -> list[str]:
        """Split ``text`` on paragraph boundaries into roughly ``chunk_size`` pieces.

        Each new piece starts with a tail of recent paragraphs totalling at
        most ``chunk_overlap`` characters, so context is shared across pieces.
        """
        if len(text) <= self.chunk_size:
            return [text]

        pieces: list[str] = []
        pending: list[str] = []
        pending_len = 0

        for paragraph in re.split(r"\n\n+", text):
            size = len(paragraph)

            # Flush once adding this paragraph would overflow the budget.
            if pending and pending_len + size > self.chunk_size:
                pieces.append("\n\n".join(pending))

                # Carry the most recent paragraphs forward as overlap context,
                # newest-first until the overlap budget is exhausted.
                tail: list[str] = []
                tail_len = 0
                for prev in reversed(pending):
                    if tail_len + len(prev) > self.chunk_overlap:
                        break
                    tail.append(prev)
                    tail_len += len(prev)
                tail.reverse()

                pending = tail
                pending_len = tail_len

            pending.append(paragraph)
            pending_len += size

        if pending:
            pieces.append("\n\n".join(pending))

        return pieces

0 commit comments

Comments
 (0)