fern-api
diff --git a/servers/fai/poetry.lock b/servers/fai/poetry.lock
Lines changed: 79 additions & 25 deletions
diff --git a/servers/fai/pyproject.toml b/servers/fai/pyproject.toml
Lines changed: 3 additions & 0 deletions
diff --git a/servers/fai/src/fai/utils/website/__init__.py b/servers/fai/src/fai/utils/website/__init__.py
Lines changed: 6 additions & 0 deletions
diff --git a/servers/fai/src/fai/utils/website/chunker.py b/servers/fai/src/fai/utils/website/chunker.py
Lines changed: 151 additions & 0 deletions
@@ -45,6 +45,9 @@ slack-sdk = "^3.36.0"
 python-multipart = "^0.0.20"
 upstash-redis = "^1.4.0"
 aioboto3 = "^13.0.0"
+markdownify = "^1.2.0"
+tenacity = "^8.2.0"
+beautifulsoup4 = "^4.12.0"
 
 [tool.poetry.scripts]
 start = "fai.main:start"
 
@@ -0,0 +1,6 @@
+from fai.utils.website.chunker import MarkdownChunker
+from fai.utils.website.crawler import DocumentationCrawler
+from fai.utils.website.extractor import ContentExtractor
+from fai.utils.website.models import DocumentChunk
+
+__all__ = ["DocumentChunk", "ContentExtractor", "MarkdownChunker", "DocumentationCrawler"]
@@ -0,0 +1,151 @@
+import re
+
+from fai.utils.website.models import DocumentChunk
+
+
+class MarkdownChunker:
+    """Split markdown into heading-scoped, size-limited ``DocumentChunk`` objects.
+
+    The document is cut at ATX headings (``#`` through ``######``). A section that
+    fits in ``chunk_size`` characters becomes one chunk; a longer one is split on
+    blank-line paragraph boundaries, repeating up to ``chunk_overlap`` characters
+    of trailing paragraphs at the start of the next part. Sections and parts
+    shorter than ``min_chunk_size`` characters are dropped.
+    """
+
+    # Compiled once: both patterns are applied to every line of every document.
+    _HEADER_RE = re.compile(r"^(#{1,6})\s+(.+)$")
+    # Fence opener/closer: up to three spaces of indent, then >= 3 backticks or tildes.
+    _FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})(.*)$")
+    _PARAGRAPH_SEP = "\n\n"
+
+    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200, min_chunk_size: int = 100):
+        """Store the chunking limits (all measured in characters).
+
+        Args:
+            chunk_size: Target maximum size of a chunk body.
+            chunk_overlap: Maximum amount of trailing paragraph text repeated at
+                the start of the following part of a split section.
+            min_chunk_size: Sections and parts shorter than this are discarded.
+
+        Raises:
+            ValueError: If ``chunk_size`` is not positive; a window of zero or
+                negative width cannot be used to split text.
+        """
+        if chunk_size < 1:
+            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
+
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.min_chunk_size = min_chunk_size
+
+    def chunk_document(
+        self, markdown_content: str, title: str, metadata: dict[str, str | list[str] | None]
+    ) -> list[DocumentChunk]:
+        """Chunk a whole markdown document.
+
+        Args:
+            markdown_content: Markdown source of the document.
+            title: Document title, stored as ``document_title`` in chunk metadata.
+            metadata: Extra metadata merged into every chunk; its keys take
+                precedence over the keys generated here.
+
+        Returns:
+            Chunks in document order; empty when no section is long enough.
+        """
+        chunks: list[DocumentChunk] = []
+        for section in self._split_by_headers(markdown_content):
+            chunks.extend(self._chunk_section(section, title, metadata))
+        return chunks
+
+    def _split_by_headers(self, markdown: str) -> list[dict[str, str | int | None]]:
+        """Cut markdown into sections at ATX heading lines.
+
+        Heading-looking lines inside fenced code blocks (for example ``# comment``
+        in a shell snippet) are treated as ordinary content, so code samples stay
+        attached to the section that contains them.
+
+        Returns:
+            One dict per section with keys ``heading`` (``None`` for text before
+            the first heading), ``level`` (``0`` when there is no heading) and
+            ``content`` (the section body without its heading line). A heading
+            that is immediately followed by another heading produces no section.
+        """
+        sections: list[dict[str, str | int | None]] = []
+
+        current_lines: list[str] = []
+        current_heading: str | None = None
+        current_level: int = 0
+        open_fence: str | None = None  # marker of the code fence we are inside, if any
+
+        for line in markdown.split("\n"):
+            fence_match = self._FENCE_RE.match(line)
+            if fence_match:
+                marker, trailing = fence_match.group(1), fence_match.group(2)
+                if open_fence is None:
+                    open_fence = marker
+                elif marker[0] == open_fence[0] and len(marker) >= len(open_fence) and not trailing.strip():
+                    # A closing fence uses the same character, is at least as long
+                    # as the opener and carries no info string.
+                    open_fence = None
+                current_lines.append(line)
+                continue
+
+            header_match = None if open_fence is not None else self._HEADER_RE.match(line)
+            if header_match is None:
+                current_lines.append(line)
+                continue
+
+            if current_lines:
+                sections.append(
+                    {"heading": current_heading, "level": current_level, "content": "\n".join(current_lines)}
+                )
+
+            current_level = len(header_match.group(1))
+            current_heading = header_match.group(2).strip()
+            current_lines = []
+
+        if current_lines:
+            sections.append({"heading": current_heading, "level": current_level, "content": "\n".join(current_lines)})
+
+        # A document made only of heading lines yields no section above; keep its
+        # text as a single heading-less section instead of returning nothing.
+        if not sections and markdown.strip():
+            sections.append({"heading": None, "level": 0, "content": markdown})
+
+        return sections
+
+    def _chunk_section(
+        self, section: dict[str, str | int | None], doc_title: str, base_metadata: dict[str, str | list[str] | None]
+    ) -> list[DocumentChunk]:
+        """Turn one section dict from ``_split_by_headers`` into chunks.
+
+        A section that fits in ``chunk_size`` yields one ``"section"`` chunk.
+        A longer section yields ``"section_part"`` chunks; parts shorter than
+        ``min_chunk_size`` are discarded before numbering, so ``part_number``
+        runs 1..``total_parts`` without gaps and ``total_parts`` equals the
+        number of chunks actually returned.
+
+        Returns:
+            The chunks for this section; empty when the stripped content is
+            empty or shorter than ``min_chunk_size``.
+        """
+        heading_val = section["heading"]
+        level_val = section["level"]
+        content_val = section["content"]
+
+        # The section dict is loosely typed; narrow each field before use.
+        heading: str | None = heading_val if isinstance(heading_val, str) else None
+        level: int = level_val if isinstance(level_val, int) else 0
+        content: str = content_val.strip() if isinstance(content_val, str) else ""
+
+        if not content or len(content) < self.min_chunk_size:
+            return []
+
+        if len(content) <= self.chunk_size:
+            chunk_content = f"# {heading}\n\n{content}" if heading else content
+            return [self._build_chunk(chunk_content, doc_title, heading, level, "section", base_metadata)]
+
+        kept_parts = [
+            part for part in self._split_with_overlap(content) if len(part.strip()) >= self.min_chunk_size
+        ]
+
+        chunks: list[DocumentChunk] = []
+        for part_number, part in enumerate(kept_parts, start=1):
+            if not heading:
+                chunk_content = part
+            elif part_number == 1:
+                chunk_content = f"# {heading}\n\n{part}"
+            else:
+                chunk_content = f"[Continuing from: {heading}]\n\n{part}"
+
+            chunks.append(
+                self._build_chunk(
+                    chunk_content,
+                    doc_title,
+                    heading,
+                    level,
+                    "section_part",
+                    base_metadata,
+                    {"part_number": part_number, "total_parts": len(kept_parts)},
+                )
+            )
+
+        return chunks
+
+    def _build_chunk(
+        self,
+        content: str,
+        doc_title: str,
+        heading: str | None,
+        level: int,
+        chunk_type: str,
+        base_metadata: dict[str, str | list[str] | None],
+        extra: dict[str, int] | None = None,
+    ) -> DocumentChunk:
+        """Create a ``DocumentChunk`` with the standard metadata keys."""
+        return DocumentChunk(
+            content=content,
+            metadata={
+                "document_title": doc_title,
+                "section_heading": heading,
+                "heading_level": level,
+                "chunk_type": chunk_type,
+                **(extra or {}),
+                # Spread last so caller-supplied metadata overrides generated keys.
+                **base_metadata,
+            },
+        )
+
+    def _split_with_overlap(self, text: str) -> list[str]:
+        """Split text into paragraph-aligned parts with trailing-paragraph overlap.
+
+        Paragraphs (separated by blank lines) are packed greedily until adding
+        the next one would push the running total past ``chunk_size``. Each new
+        part starts with as many trailing paragraphs of the previous part as fit
+        in ``chunk_overlap``. A paragraph that is itself longer than
+        ``chunk_size`` is first broken up by ``_split_oversized``, so no single
+        paragraph can produce an arbitrarily large part.
+
+        Separators and carried-over overlap are not counted against the limit,
+        so a part may exceed ``chunk_size`` by roughly ``chunk_overlap``.
+
+        Returns:
+            The parts in order; ``[text]`` when the text already fits.
+        """
+        if len(text) <= self.chunk_size:
+            return [text]
+
+        pieces: list[str] = []
+        for paragraph in re.split(r"\n\n+", text):
+            pieces.extend(self._split_oversized(paragraph))
+
+        chunks: list[str] = []
+        current_chunk: list[str] = []
+        current_length = 0
+
+        for piece in pieces:
+            if current_chunk and current_length + len(piece) > self.chunk_size:
+                chunks.append(self._PARAGRAPH_SEP.join(current_chunk))
+                current_chunk, current_length = self._overlap_tail(current_chunk)
+
+            current_chunk.append(piece)
+            current_length += len(piece)
+
+        if current_chunk:
+            chunks.append(self._PARAGRAPH_SEP.join(current_chunk))
+
+        return chunks
+
+    def _overlap_tail(self, paragraphs: list[str]) -> tuple[list[str], int]:
+        """Return the trailing paragraphs that fit in ``chunk_overlap`` and their total length."""
+        tail: list[str] = []
+        tail_length = 0
+
+        for paragraph in reversed(paragraphs):
+            if tail_length + len(paragraph) > self.chunk_overlap:
+                break
+            tail.append(paragraph)
+            tail_length += len(paragraph)
+
+        # Collected back-to-front; restore document order.
+        tail.reverse()
+        return tail, tail_length
+
+    def _split_oversized(self, paragraph: str) -> list[str]:
+        """Break a paragraph longer than ``chunk_size`` into pieces that fit.
+
+        Whole lines are grouped greedily first; a single line that is still too
+        long is cut into fixed-width character windows. Joining the pieces with
+        ``"\\n"`` (line groups) or ``""`` (windows of one line) restores the input.
+        """
+        limit = self.chunk_size
+        # limit < 1 can only happen if the attribute was changed after
+        # construction; leave the paragraph intact rather than lose text.
+        if limit < 1 or len(paragraph) <= limit:
+            return [paragraph]
+
+        pieces: list[str] = []
+        buffered_lines: list[str] = []
+        buffered_length = 0  # length of "\n".join(buffered_lines)
+
+        for line in paragraph.split("\n"):
+            if len(line) > limit:
+                if buffered_lines:
+                    pieces.append("\n".join(buffered_lines))
+                    buffered_lines, buffered_length = [], 0
+                pieces.extend(line[start : start + limit] for start in range(0, len(line), limit))
+                continue
+
+            added = len(line) + (1 if buffered_lines else 0)
+            if buffered_lines and buffered_length + added > limit:
+                pieces.append("\n".join(buffered_lines))
+                buffered_lines, buffered_length = [], 0
+                added = len(line)
+
+            buffered_lines.append(line)
+            buffered_length += added
+
+        if buffered_lines:
+            pieces.append("\n".join(buffered_lines))
+
+        return pieces