Skip to content

Commit 2b9f033

Browse files
author
Daniele Briggi
committed
feat(metadata): extract title from markdown
1 parent 43cfe1a commit 2b9f033

File tree

3 files changed

+55
-0
lines changed

3 files changed

+55
-0
lines changed

src/sqlite_rag/engine.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,24 @@ def versions(self) -> dict:
215215
"vector_version": row["vector_version"],
216216
}
217217

218+
def extract_document_title(
219+
self, text: str, fallback_first_line: bool = False
220+
) -> str | None:
221+
"""Extract title from markdown content."""
222+
# Look for first level-1 heading
223+
match = re.search(r"^# (.+)$", text, re.MULTILINE)
224+
if match:
225+
return match.group(1).strip()
226+
227+
# Fallback: first non-empty line
228+
if fallback_first_line:
229+
for line in text.splitlines():
230+
line = line.strip()
231+
if line:
232+
return line
233+
234+
return None
235+
218236
def close(self):
219237
"""Close the database connection."""
220238
try:

src/sqlite_rag/sqliterag.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,11 @@ def add(
102102
else str(file_path.absolute())
103103
)
104104
document = Document(content=content, uri=uri, metadata=metadata)
105+
document.metadata["generated"]["title"] = (
106+
self._engine.extract_document_title(
107+
document.content, fallback_first_line=True
108+
)
109+
)
105110

106111
exists = self._repository.document_exists_by_hash(document.hash())
107112
if exists:

tests/test_engine.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1+
import pytest
2+
13
from sqlite_rag.chunker import Chunker
24
from sqlite_rag.engine import Engine
35
from sqlite_rag.models.chunk import Chunk
46
from sqlite_rag.models.document import Document
57
from sqlite_rag.repository import Repository
8+
from sqlite_rag.settings import Settings
69

710

811
class TestEngine:
@@ -192,3 +195,32 @@ def test_search_exact_match(self, db_conn):
192195
assert len(results) > 0
193196
assert doc1_id == results[0].document.id
194197
assert 0.0 == results[0].vec_distance
198+
199+
def test_extract_document_title(self):
    """A leading level-1 heading is returned as the document title."""
    # No connection or chunker is passed: the engine is built with None
    # placeholders and only the title extraction is exercised.
    subject = Engine(None, Settings(), None)  # type: ignore
    markdown = """# This is the Title
This is the content of the document.
It has multiple lines.
"""

    assert subject.extract_document_title(markdown) == "This is the Title"
209+
210+
@pytest.mark.parametrize(
    "fallback, expected_title",
    [
        (True, "This is the first line of the document without a title."),
        (False, None),
    ],
)
def test_extract_document_title_from_first_line(self, fallback, expected_title):
    """Without a heading, the first non-empty line is used only on request."""
    # The document opens with a blank line and contains no "# " heading.
    markdown = """
This is the first line of the document without a title.
It has multiple lines.
"""
    subject = Engine(None, Settings(), None)  # type: ignore

    extracted = subject.extract_document_title(markdown, fallback)

    assert extracted == expected_title

0 commit comments

Comments
 (0)