Skip to content

Commit a7d6826

Browse files
author
Daniele Briggi
committed
feat(chunk): remove overlapping text from snippet results
1 parent c3c53e3 commit a7d6826

File tree

6 files changed

+15
-33
lines changed

6 files changed

+15
-33
lines changed

src/sqlite_rag/chunker.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,10 +161,16 @@ def _apply_overlap(self, chunks: List[Chunk]) -> List[Chunk]:
161161

162162
if overlap_text:
163163
combined_content = overlap_text + " " + current_content
164+
# Core content starts after overlap and separator
165+
core_start_pos = len(overlap_text) + 1
164166
else:
165167
combined_content = current_content
168+
# No overlap, core starts at beginning
169+
core_start_pos = 0
166170

167-
overlapped_chunks.append(Chunk(content=combined_content))
171+
overlapped_chunks.append(
172+
Chunk(content=combined_content, core_start_pos=core_start_pos)
173+
)
168174

169175
return overlapped_chunks
170176

src/sqlite_rag/database.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def _create_schema(conn: sqlite3.Connection, settings: Settings):
7878
document_id TEXT,
7979
content TEXT,
8080
embedding BLOB,
81+
core_start_pos INTEGER DEFAULT 0,
8182
FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE
8283
);
8384
"""

src/sqlite_rag/engine.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ def search(self, query: str, limit: int = 10) -> list[DocumentResult]:
170170
documents.content as document_content,
171171
documents.metadata,
172172
chunks.content AS snippet,
173+
chunks.core_start_pos,
173174
vec_rank,
174175
fts_rank,
175176
combined_rank,
@@ -200,7 +201,8 @@ def search(self, query: str, limit: int = 10) -> list[DocumentResult]:
200201
content=row["document_content"],
201202
metadata=json.loads(row["metadata"]) if row["metadata"] else {},
202203
),
203-
snippet=row["snippet"],
204+
# remove overlapping text from the snippet
205+
snippet=row["snippet"][row["core_start_pos"] :],
204206
vec_rank=row["vec_rank"],
205207
fts_rank=row["fts_rank"],
206208
combined_rank=row["combined_rank"],

src/sqlite_rag/formatters.py

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def _clean_and_wrap_snippet(self, snippet: str, width: int = 75) -> List[str]:
180180

181181

182182
class TableDebugFormatter(SearchResultFormatter):
183-
"""Legacy debug formatter for backwards compatibility."""
183+
"""Table view debug formatter."""
184184

185185
def format_results(self, results: List[DocumentResult], query: str) -> None:
186186
if not results:
@@ -222,34 +222,6 @@ def format_results(self, results: List[DocumentResult], query: str) -> None:
222222
)
223223

224224

225-
class LegacyCompactFormatter(SearchResultFormatter):
226-
"""Legacy compact formatter for backwards compatibility."""
227-
228-
def format_results(self, results: List[DocumentResult], query: str) -> None:
229-
if not results:
230-
typer.echo("No documents found matching the query.")
231-
return
232-
233-
typer.echo(f"Found {len(results)} documents:")
234-
235-
# Clean simple table for normal view
236-
typer.echo(f"{'#':<3} {'Preview':<60} {'URI':<40}")
237-
typer.echo("─" * 105)
238-
239-
for idx, doc in enumerate(results, 1):
240-
# Clean snippet display
241-
snippet = doc.snippet.replace("\n", " ").replace("\r", "")
242-
if len(snippet) > 57:
243-
snippet = snippet[:54] + "..."
244-
245-
# Clean URI display
246-
uri = doc.document.uri or "N/A"
247-
if len(uri) > 37:
248-
uri = "..." + uri[-34:]
249-
250-
typer.echo(f"{idx:<3} {snippet:<60} {uri:<40}")
251-
252-
253225
def get_formatter(
254226
debug: bool = False, table_view: bool = False
255227
) -> SearchResultFormatter:

src/sqlite_rag/models/chunk.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ class Chunk:
77
document_id: int | None = None
88
content: str = ""
99
embedding: str | bytes = b""
10+
core_start_pos: int = 0

src/sqlite_rag/repository.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ def add_document(self, document: Document) -> str:
3030
for chunk in document.chunks:
3131
# TODO: use the right vector_convert function based on the vector type
3232
cursor.execute(
33-
"INSERT INTO chunks (document_id, content, embedding) VALUES (?, ?, vector_as_f16(?))",
34-
(document_id, chunk.content, chunk.embedding),
33+
"INSERT INTO chunks (document_id, content, embedding, core_start_pos) VALUES (?, ?, ?, ?)",
34+
(document_id, chunk.content, chunk.embedding, chunk.core_start_pos),
3535
)
3636
cursor.execute(
3737
"INSERT INTO chunks_fts (rowid, content) VALUES (last_insert_rowid(), ?)",

0 commit comments

Comments
 (0)