Skip to content

Commit 3373c0e

Browse files
author
Daniele Briggi
committed
feat(extractor): file specific extractor
Add extractor of md frontmatter for metadata
1 parent 1e24f17 commit 3373c0e

File tree

12 files changed

+242
-5
lines changed

12 files changed

+242
-5
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ share/python-wheels/
2929
.installed.cfg
3030
*.egg
3131
MANIFEST
32+
.venv/
33+
venv/
3234

3335
# PyInstaller
3436
*.manifest

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ dependencies = [
2525
"markitdown[pptx]",
2626
"markitdown[xls]",
2727
"markitdown[xlsx]",
28+
"python-frontmatter",
2829
"prompt-toolkit",
2930
"sqlite-ai",
3031
"sqliteai-vector"
@@ -34,7 +35,7 @@ dependencies = [
3435
dev = [
3536
"pytest",
3637
"pytest-mock",
37-
"pytest-cov==6.3.0",
38+
"pytest-cov",
3839
"black",
3940
"flake8",
4041
"bandit",

src/sqlite_rag/database.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def initialize(conn: sqlite3.Connection, settings: Settings) -> sqlite3.Connecti
2929
)
3030
)
3131
conn.load_extension(
32-
str(importlib.resources.files("sqlite-vector.binaries") / "vector")
32+
str(importlib.resources.files("sqlite_vector.binaries") / "vector")
3333
)
3434
except sqlite3.OperationalError as e:
3535
raise RuntimeError(

src/sqlite_rag/extractor.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from pathlib import Path
2+
from typing import Dict, Tuple
3+
4+
from sqlite_rag.extractors.base import MetadataExtractor
5+
from sqlite_rag.extractors.frontmatter import FrontmatterExtractor
6+
7+
8+
class Extractor:
    """Route metadata extraction to the extractor registered for a file type."""

    # Registered extractors, consulted in list order; the first one whose
    # supports_file_type() accepts the extension is used.
    extractors = [
        FrontmatterExtractor(),
    ]

    def get_extractor(self, file_extension: str) -> MetadataExtractor | None:
        """Return the first registered extractor that supports the extension.

        Args:
            file_extension: File suffix as produced by ``Path.suffix``

        Returns:
            The matching extractor, or None when no extractor supports it
        """
        return next(
            (
                candidate
                for candidate in self.extractors
                if candidate.supports_file_type(file_extension)
            ),
            None,
        )

    def extract_metadata(self, content: str, file_path: Path) -> Tuple[str, Dict]:
        """Split raw content into clean content and file-embedded metadata.

        The extractor is selected from the suffix of ``file_path``.

        Args:
            content: Raw content to extract metadata from
            file_path: Path of the file the content was read from

        Returns:
            Tuple of (clean_content, metadata_dict); the content is returned
            untouched with an empty dict when no extractor handles the file
        """
        handler = self.get_extractor(file_path.suffix)
        if not handler:
            # Unsupported file type: nothing to strip, nothing to report.
            return content, {}

        return handler.extract(content, file_path)

src/sqlite_rag/extractors/__init__.py

Whitespace-only changes.

src/sqlite_rag/extractors/base.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from abc import ABC, abstractmethod
2+
from pathlib import Path
3+
from typing import Dict, Optional, Tuple
4+
5+
6+
class MetadataExtractor(ABC):
    """Abstract contract every metadata extractor has to implement."""

    @abstractmethod
    def extract(
        self,
        content: str,
        file_path: Optional[Path] = None,
    ) -> Tuple[str, Dict]:
        """Separate embedded metadata from the given content.

        Args:
            content: Raw text that may carry embedded metadata
            file_path: Path of the originating file, if available

        Returns:
            Tuple of (clean_content, metadata_dict)
        """

    @abstractmethod
    def supports_file_type(self, file_extension: str) -> bool:
        """Tell whether this extractor handles files with the given extension.

        Args:
            file_extension: Extension including the dot (e.g., '.md', '.pdf')

        Returns:
            True when the extractor can process that file type
        """
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from pathlib import Path
2+
from typing import Dict, Optional, Tuple
3+
4+
import frontmatter
5+
6+
from sqlite_rag.extractors.base import MetadataExtractor
7+
8+
9+
class FrontmatterExtractor(MetadataExtractor):
    """Metadata extractor that reads a frontmatter header from text content."""

    def extract(
        self,
        content: str,
        file_path: Optional[Path] = None,
    ) -> Tuple[str, Dict]:
        """Parse frontmatter out of the content.

        Args:
            content: Raw text, optionally starting with a frontmatter block
            file_path: Unused by this extractor; accepted to honour the
                MetadataExtractor interface

        Returns:
            Tuple of (content as returned by the frontmatter parser, metadata
            dict). If parsing raises, the original content is returned
            together with an empty dict.
        """
        try:
            parsed = frontmatter.loads(content)
            return parsed.content, dict(parsed.metadata)
        except Exception:
            # Best effort: a document whose frontmatter cannot be parsed is
            # still indexed, just without extracted metadata.
            return content, {}

    def supports_file_type(self, file_extension: str) -> bool:
        """Accept '.md', '.mdx' and '.txt' extensions, ignoring case."""
        normalized = file_extension.lower()
        return normalized in (".md", ".mdx", ".txt")

src/sqlite_rag/reader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def is_supported(path: Path) -> bool:
4747

4848
@staticmethod
4949
def parse_file(path: Path, max_document_size_bytes: Optional[int] = None) -> str:
50+
"""Read the file and convert into Markdown text."""
5051
try:
5152
converter = MarkItDown()
5253
text = converter.convert(

src/sqlite_rag/sqliterag.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from pathlib import Path
44
from typing import Any, Optional
55

6+
from sqlite_rag.extractor import Extractor
67
from sqlite_rag.logger import Logger
78
from sqlite_rag.models.document_result import DocumentResult
89

@@ -25,6 +26,7 @@ def __init__(self, connection: sqlite3.Connection, settings: Settings):
2526
self._repository = Repository(self._conn, settings)
2627
self._chunker = Chunker(self._conn, settings)
2728
self._engine = Engine(self._conn, settings, chunker=self._chunker)
29+
self._extractor = Extractor()
2830

2931
self.ready = False
3032

@@ -103,7 +105,16 @@ def add(
103105
if use_relative_paths
104106
else str(file_path.absolute())
105107
)
106-
document = Document(content=content, uri=uri, metadata=metadata.copy())
108+
109+
content, file_metadata = self._extractor.extract_metadata(
110+
content, file_path
111+
)
112+
113+
merged_metadata = metadata.copy()
114+
if file_metadata:
115+
merged_metadata["extracted"] = file_metadata
116+
117+
document = Document(content=content, uri=uri, metadata=merged_metadata)
107118

108119
exists = self._repository.document_exists_by_hash(document.hash())
109120
if exists:
@@ -187,7 +198,21 @@ def rebuild(self, remove_missing: bool = False) -> dict:
187198
content = FileReader.parse_file(
188199
Path(doc.uri), self._settings.max_document_size_bytes
189200
)
201+
202+
if not content:
203+
self._logger.warning(
204+
f"{i+1}/{total_docs} Skipping empty file: {doc.uri}"
205+
)
206+
not_found += 1
207+
continue
208+
209+
content, file_metadata = self._extractor.extract_metadata(
210+
content, Path(doc.uri)
211+
)
212+
190213
doc.content = content
214+
if file_metadata:
215+
doc.metadata["extracted"] = file_metadata
191216

192217
self._repository.remove_document(doc_id)
193218
processed_doc = self._engine.process(doc)
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from sqlite_rag.extractors.frontmatter import FrontmatterExtractor
2+
3+
4+
class TestFrontmatterExtractor:
    """Unit tests for FrontmatterExtractor."""

    def test_extract_with_frontmatter(self):
        """Frontmatter keys end up in the metadata; the body text is kept."""
        raw = (
            "---\n"
            "title: Test Document\n"
            "author: John Doe\n"
            "---\n"
            "# Heading 1\n"
            "This is a test document.\n"
        )
        body, meta = FrontmatterExtractor().extract(raw)

        for key, expected in (("title", "Test Document"), ("author", "John Doe")):
            assert key in meta
            assert meta[key] == expected
        for fragment in ("# Heading 1", "This is a test document."):
            assert fragment in body

    def test_extract_without_frontmatter(self):
        """Content without frontmatter yields empty metadata and keeps its text."""
        raw = (
            "# Heading 1\n"
            "This is a test document without frontmatter.\n"
        )
        body, meta = FrontmatterExtractor().extract(raw)

        assert meta == {}
        for fragment in (
            "# Heading 1",
            "This is a test document without frontmatter.",
        ):
            assert fragment in body

    def test_supports_file_type(self):
        """Supported extensions match case-insensitively; others are rejected."""
        subject = FrontmatterExtractor()

        for accepted in (".md", ".MDX", ".txt"):
            assert subject.supports_file_type(accepted)
        for rejected in (".pdf", ".html"):
            assert not subject.supports_file_type(rejected)

0 commit comments

Comments
 (0)