Skip to content

Commit 99d3107

Browse files
Merge branch 'main' into jd/ci-workflow-split
2 parents 4c0deb0 + aac48f8 commit 99d3107

File tree

18 files changed

+135
-108
lines changed

18 files changed

+135
-108
lines changed
Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
# Document Search
22

3-
::: ragbits.document_search.DocumentSearch
3+
::: ragbits.document_search.DocumentSearchOptions
4+
5+
::: ragbits.document_search.DocumentSearch

examples/document-search/chroma.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
from ragbits.core.embeddings.dense import LiteLLMEmbedder, LiteLLMEmbedderOptions
4040
from ragbits.core.vector_stores.base import VectorStoreOptions
4141
from ragbits.core.vector_stores.chroma import ChromaVectorStore
42-
from ragbits.document_search import DocumentSearch, SearchConfig
42+
from ragbits.document_search import DocumentSearch, DocumentSearchOptions
4343
from ragbits.document_search.documents.document import DocumentMeta
4444

4545
set_trace_handlers("cli")
@@ -101,14 +101,12 @@ async def main() -> None:
101101
print([doc.metadata["content"] for doc in all_documents])
102102

103103
query = "I'm boiling my water and I need a joke"
104-
vector_store_kwargs = {
105-
"k": 2,
106-
"score_threshold": 0.4,
107-
}
108-
results = await document_search.search(
109-
query,
110-
config=SearchConfig(vector_store_kwargs=vector_store_kwargs),
104+
vector_store_options = VectorStoreOptions(
105+
k=2,
106+
score_threshold=0.4,
111107
)
108+
options = DocumentSearchOptions(vector_store_options=vector_store_options)
109+
results = await document_search.search(query, options)
112110

113111
print()
114112
print(f"Documents similar to: {query}")

examples/document-search/qdrant.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,9 @@
3737

3838
from ragbits.core.audit import set_trace_handlers
3939
from ragbits.core.embeddings.dense import LiteLLMEmbedder
40+
from ragbits.core.vector_stores.base import VectorStoreOptions
4041
from ragbits.core.vector_stores.qdrant import QdrantVectorStore
41-
from ragbits.document_search import DocumentSearch, SearchConfig
42+
from ragbits.document_search import DocumentSearch, DocumentSearchOptions
4243
from ragbits.document_search.documents.document import DocumentMeta
4344

4445
set_trace_handlers("cli")
@@ -92,14 +93,12 @@ async def main() -> None:
9293
print([doc.metadata["content"] for doc in all_documents])
9394

9495
query = "I'm boiling my water and I need a joke"
95-
vector_store_kwargs = {
96-
"k": 2,
97-
"score_threshold": 0.6,
98-
}
99-
results = await document_search.search(
100-
query,
101-
config=SearchConfig(vector_store_kwargs=vector_store_kwargs),
96+
vector_store_options = VectorStoreOptions(
97+
k=2,
98+
score_threshold=0.6,
10299
)
100+
options = DocumentSearchOptions(vector_store_options=vector_store_options)
101+
results = await document_search.search(query, options=options)
103102

104103
print()
105104
print(f"Documents similar to: {query}")

packages/ragbits-core/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
## Unreleased
44

5+
- Adjust typing for DocumentSearch (#554)
56
- Add Prometheus & Grafana Monitoring for LLMs Using OpenTelemetry (#427)
67
- Restructure audit module (#427)
78

packages/ragbits-core/tests/integration/vector_stores/test_vector_store.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ async def test_handling_document_ingestion_with_different_content_and_verifying_
233233
document_1 = DocumentMeta.create_text_document_from_literal(document_1_content)
234234
document_2 = DocumentMeta.create_text_document_from_literal(document_2_content)
235235

236-
document_search = DocumentSearch(
236+
document_search: DocumentSearch = DocumentSearch(
237237
vector_store=text_vector_store,
238238
)
239239
await document_search.ingest([document_1, document_2])

packages/ragbits-document-search/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
## Unreleased
44

55
- Update audit imports (#427)
6+
- BREAKING CHANGE: Adjust document search configurable interface (#554)
7+
- BREAKING CHANGE: Rename SearchConfig to DocumentSearchOptions (#554)
8+
- BREAKING CHANGE: Improve typing for DocumentSearchOptions (formerly SearchConfig) (#554)
69

710
## 0.17.1 (2025-05-09)
811

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
from ._main import DocumentSearch, SearchConfig
1+
from ragbits.document_search._main import DocumentSearch, DocumentSearchOptions
22

3-
__all__ = ["DocumentSearch", "SearchConfig"]
3+
__all__ = ["DocumentSearch", "DocumentSearchOptions"]

packages/ragbits-document-search/src/ragbits/document_search/__version__.py

Lines changed: 0 additions & 3 deletions
This file was deleted.

packages/ragbits-document-search/src/ragbits/document_search/_main.py

Lines changed: 70 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,52 @@
11
from collections.abc import Iterable, Sequence
22
from pathlib import Path
33
from types import ModuleType
4-
from typing import Any, ClassVar
4+
from typing import ClassVar, Generic
55

6-
from pydantic import BaseModel, Field
6+
from pydantic import BaseModel
77
from typing_extensions import Self
88

99
from ragbits import document_search
1010
from ragbits.core.audit.traces import trace, traceable
1111
from ragbits.core.config import CoreConfig
12+
from ragbits.core.options import Options
1213
from ragbits.core.sources.base import Source, SourceResolver
14+
from ragbits.core.types import NOT_GIVEN, NotGiven
1315
from ragbits.core.utils._pyproject import get_config_from_yaml
1416
from ragbits.core.utils.config_handling import (
17+
ConfigurableComponent,
1518
NoPreferredConfigError,
1619
ObjectConstructionConfig,
17-
WithConstructionConfig,
1820
)
19-
from ragbits.core.vector_stores import VectorStore
20-
from ragbits.core.vector_stores.base import VectorStoreOptions
21+
from ragbits.core.vector_stores.base import VectorStore, VectorStoreOptionsT
2122
from ragbits.document_search.documents.document import Document, DocumentMeta
2223
from ragbits.document_search.documents.element import Element
2324
from ragbits.document_search.ingestion.enrichers.router import ElementEnricherRouter
2425
from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
25-
from ragbits.document_search.ingestion.strategies import (
26-
IngestStrategy,
27-
SequentialIngestStrategy,
28-
)
26+
from ragbits.document_search.ingestion.strategies import IngestStrategy, SequentialIngestStrategy
2927
from ragbits.document_search.ingestion.strategies.base import IngestExecutionError, IngestExecutionResult
3028
from ragbits.document_search.retrieval.rephrasers.base import QueryRephraser
3129
from ragbits.document_search.retrieval.rephrasers.noop import NoopQueryRephraser
32-
from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptions
30+
from ragbits.document_search.retrieval.rerankers.base import Reranker, RerankerOptionsT
3331
from ragbits.document_search.retrieval.rerankers.noop import NoopReranker
3432

3533

36-
class SearchConfig(BaseModel):
34+
class DocumentSearchOptions(Options, Generic[VectorStoreOptionsT, RerankerOptionsT]):
3735
"""
38-
Configuration for the search process.
36+
Object representing the options for the document search.
37+
38+
Attributes:
39+
vector_store_options: The options for the vector store.
40+
reranker_options: The options for the reranker.
3941
"""
4042

41-
reranker_kwargs: dict[str, Any] = Field(default_factory=dict)
42-
vector_store_kwargs: dict[str, Any] = Field(default_factory=dict)
43-
embedder_kwargs: dict[str, Any] = Field(default_factory=dict)
43+
vector_store_options: VectorStoreOptionsT | None | NotGiven = NOT_GIVEN
44+
reranker_options: RerankerOptionsT | None | NotGiven = NOT_GIVEN
4445

4546

4647
class DocumentSearchConfig(BaseModel):
4748
"""
48-
Schema for the dict taken by DocumentSearch.from_config method.
49+
Schema for the document search config.
4950
"""
5051

5152
vector_store: ObjectConstructionConfig
@@ -56,39 +57,49 @@ class DocumentSearchConfig(BaseModel):
5657
enricher_router: dict[str, ObjectConstructionConfig] = {}
5758

5859

59-
class DocumentSearch(WithConstructionConfig):
60+
class DocumentSearch(ConfigurableComponent[DocumentSearchOptions[VectorStoreOptionsT, RerankerOptionsT]]):
6061
"""
61-
A main entrypoint to the DocumentSearch functionality.
62-
63-
It provides methods for both ingestion and retrieval.
62+
Main entrypoint to the document search functionality. It provides methods for document retrieval and ingestion.
6463
6564
Retrieval:
66-
6765
1. Uses QueryRephraser to rephrase the query.
68-
2. Uses VectorStore to retrieve the most relevant chunks.
69-
3. Uses Reranker to rerank the chunks.
66+
2. Uses VectorStore to retrieve the most relevant elements.
67+
3. Uses Reranker to rerank the elements.
68+
69+
Ingestion:
70+
1. Uses IngestStrategy to orchestrate ingestion process.
71+
2. Uses DocumentParserRouter to route the document to the appropriate DocumentParser to parse the content.
72+
3. Uses ElementEnricherRouter to redirect the element to the appropriate ElementEnricher to enrich the element.
7073
"""
7174

75+
options_cls: type[DocumentSearchOptions] = DocumentSearchOptions
7276
default_module: ClassVar[ModuleType | None] = document_search
7377
configuration_key: ClassVar[str] = "document_search"
7478

75-
vector_store: VectorStore
76-
query_rephraser: QueryRephraser
77-
reranker: Reranker
78-
79-
ingest_strategy: IngestStrategy
80-
parser_router: DocumentParserRouter
81-
enricher_router: ElementEnricherRouter
82-
8379
def __init__(
8480
self,
85-
vector_store: VectorStore,
81+
vector_store: VectorStore[VectorStoreOptionsT],
82+
*,
8683
query_rephraser: QueryRephraser | None = None,
87-
reranker: Reranker | None = None,
84+
reranker: Reranker[RerankerOptionsT] | None = None,
85+
default_options: DocumentSearchOptions[VectorStoreOptionsT, RerankerOptionsT] | None = None,
8886
ingest_strategy: IngestStrategy | None = None,
8987
parser_router: DocumentParserRouter | None = None,
9088
enricher_router: ElementEnricherRouter | None = None,
9189
) -> None:
90+
"""
91+
Initialize the DocumentSearch instance.
92+
93+
Args:
94+
vector_store: The vector store to use for retrieval.
95+
query_rephraser: The query rephraser to use for retrieval.
96+
reranker: The reranker to use for retrieval.
97+
default_options: The default options for the search.
98+
ingest_strategy: The ingestion strategy to use for ingestion.
99+
parser_router: The document parser router to use for ingestion.
100+
enricher_router: The element enricher router to use for ingestion.
101+
"""
102+
super().__init__(default_options=default_options)
92103
self.vector_store = vector_store
93104
self.query_rephraser = query_rephraser or NoopQueryRephraser()
94105
self.reranker = reranker or NoopReranker()
@@ -178,39 +189,47 @@ def preferred_subclass(
178189

179190
raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")
180191

181-
async def search(self, query: str, config: SearchConfig | None = None) -> Sequence[Element]:
192+
async def search(
193+
self,
194+
query: str,
195+
options: DocumentSearchOptions[VectorStoreOptionsT, RerankerOptionsT] | None = None,
196+
) -> Sequence[Element]:
182197
"""
183198
Search for the most relevant chunks for a query.
184199
185200
Args:
186201
query: The query to search for.
187-
config: The search configuration.
202+
options: The document search retrieval options.
188203
189204
Returns:
190205
A list of chunks.
191206
"""
192-
config = config or SearchConfig()
193-
queries = await self.query_rephraser.rephrase(query)
194-
with trace(queries=queries, config=config, vectore_store=self.vector_store, reranker=self.reranker) as outputs:
195-
elements = []
196-
197-
for rephrased_query in queries:
198-
results = await self.vector_store.retrieve(
199-
text=rephrased_query,
200-
options=VectorStoreOptions(**config.vector_store_kwargs),
201-
)
202-
elements.append([Element.from_vector_db_entry(result.entry, result.score) for result in results])
203-
204-
outputs.search_results = await self.reranker.rerank(
207+
merged_options = (self.default_options | options) if options else self.default_options
208+
vector_store_options = merged_options.vector_store_options or None
209+
reranker_options = merged_options.reranker_options or None
210+
211+
with trace(query=query, options=merged_options) as outputs:
212+
queries = await self.query_rephraser.rephrase(query)
213+
elements = [
214+
[
215+
Element.from_vector_db_entry(result.entry, result.score)
216+
for result in await self.vector_store.retrieve(query, vector_store_options)
217+
]
218+
for query in queries
219+
]
220+
outputs.results = await self.reranker.rerank(
205221
elements=elements,
206222
query=query,
207-
options=RerankerOptions(**config.reranker_kwargs),
223+
options=reranker_options,
208224
)
209-
return outputs.search_results
225+
226+
return outputs.results
210227

211228
@traceable
212229
async def ingest(
213-
self, documents: str | Iterable[DocumentMeta | Document | Source], fail_on_error: bool = True
230+
self,
231+
documents: str | Iterable[DocumentMeta | Document | Source],
232+
fail_on_error: bool = True,
214233
) -> IngestExecutionResult:
215234
"""
216235
Ingest documents into the search index.

packages/ragbits-document-search/src/ragbits/document_search/cli.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88

99
from ragbits.cli._utils import get_instance_or_exit
1010
from ragbits.cli.state import print_output
11-
from ragbits.document_search._main import DocumentSearch, SearchConfig
11+
from ragbits.core.vector_stores.base import VectorStoreOptions
12+
from ragbits.document_search._main import DocumentSearch, DocumentSearchOptions
1213

1314
ds_app = typer.Typer(no_args_is_help=True)
1415

@@ -84,8 +85,8 @@ async def run() -> None:
8485
if state.document_search is None:
8586
raise ValueError("Document search not initialized")
8687

87-
search_config = SearchConfig(vector_store_kwargs={"k": k})
88-
entries = await state.document_search.search(query, config=search_config)
88+
options: DocumentSearchOptions = DocumentSearchOptions(vector_store_options=VectorStoreOptions(k=k))
89+
entries = await state.document_search.search(query, options)
8990
print_output(entries, columns=columns)
9091

9192
asyncio.run(run())

0 commit comments

Comments
 (0)