diff --git a/openapi/lex-db.yaml b/openapi/lex-db.yaml index 3acc91c..639fd78 100644 --- a/openapi/lex-db.yaml +++ b/openapi/lex-db.yaml @@ -73,6 +73,39 @@ paths: application/json: schema: $ref: '#/components/schemas/HTTPValidationError' + /api/hybrid-search/indexes/{index_name}/query: + post: + tags: + - lex-db + summary: Perform hybrid search combining semantic and full-text search + description: Perform hybrid search using RRF fusion. + operationId: hybrid_search + parameters: + - name: index_name + in: path + required: true + schema: + type: string + title: Index Name + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/HybridSearchRequest' + responses: + '200': + description: Successful Response + content: + application/json: + schema: + $ref: '#/components/schemas/HybridSearchResults' + '422': + description: Validation Error + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' /api/articles: get: tags: @@ -385,3 +418,84 @@ components: - results title: VectorSearchResults description: Result of a vector search. + HybridSearchRequest: + properties: + query_text: + type: string + title: Query Text + top_k: + type: integer + title: Top K + default: 10 + top_k_semantic: + type: integer + title: Top K Semantic + default: 50 + top_k_fts: + type: integer + title: Top K Fts + default: 50 + rrf_k: + type: integer + title: Rrf K + default: 60 + type: object + required: + - query_text + title: HybridSearchRequest + description: Hybrid search request model. + HybridSearchResults: + properties: + results: + items: + $ref: '#/components/schemas/LexDbHybridSearchSearchResult' + type: array + title: Results + type: object + required: + - results + title: HybridSearchResults + description: Results of a hybrid search. + LexDbHybridSearchSearchResult: + properties: + rank: + type: integer + title: Rank + article_id: + type: integer + title: Article Id + article_headword: + type: string + title: Article Headword + chunk_sequence: + type: integer + title: Chunk Sequence + chunk_text: + type: string + title: Chunk Text + rrf_score: + type: number + title: Rrf Score + semantic_rank: + anyOf: + - type: integer + - type: 'null' + title: Semantic Rank + fts_rank: + anyOf: + - type: integer + - type: 'null' + title: Fts Rank + source: + type: string + title: Source + type: object + required: + - rank + - article_id + - article_headword + - chunk_sequence + - chunk_text + - rrf_score + - source + title: LexDbHybridSearchSearchResult diff --git a/src/lex_llm/api/connectors/lex_db_connector.py b/src/lex_llm/api/connectors/lex_db_connector.py index 9d43a20..fe24cd8 100644 --- a/src/lex_llm/api/connectors/lex_db_connector.py +++ b/src/lex_llm/api/connectors/lex_db_connector.py @@ -7,6 +7,7 @@ from lex_db_api.api_client import ApiClient from lex_db_api.configuration import Configuration from lex_db_api.models.vector_search_request import VectorSearchRequest +from lex_db_api.models.hybrid_search_request import HybridSearchRequest lexdb_client = ApiClient( configuration=Configuration(host=os.getenv("DB_HOST", "http://localhost:8000")) @@ -59,3 +60,41 @@ async def vector_search( print(f"Error connecting to LexDB: {e}") # TODO: more robust error handling/logging return [] + + async def hybrid_search( + self, + query: str, + top_k: int = 10, + top_k_semantic: int = 50, + top_k_fts: int = 50, + rrf_k: int = 60, + index_name: str = "article_embeddings_e5", + ) -> List[LexArticle]: + """Performs hybrid search using RRF fusion via the lex-db API.""" + + try: + hybrid_req = HybridSearchRequest( + query_text=query, + top_k=top_k, + top_k_semantic=top_k_semantic, + top_k_fts=top_k_fts, + rrf_k=rrf_k, + ) + + hybrid_search_result = lexdb_api.hybrid_search(index_name, hybrid_req) + + if hybrid_search_result.results: + return [ + LexArticle( + id=result.article_id, + title=result.article_headword, + text=result.chunk_text, + url=f"https://lex.dk/{result.article_headword}", + ) + for result in hybrid_search_result.results + ] + + return [] + except httpx.RequestError as e: + print(f"Error connecting to LexDB: {e}") + return [] diff --git a/src/lex_llm/tools/search_knowledge_base.py b/src/lex_llm/tools/search_knowledge_base.py index c45c5fb..2399844 100644 --- a/src/lex_llm/tools/search_knowledge_base.py +++ b/src/lex_llm/tools/search_knowledge_base.py @@ -8,6 +8,7 @@ def search_knowledge_base( index_name: str = "openai_large_3_sections", top_k: int = 10, + search_method: str = "vector_search", ) -> Callable[[Dict[str, Any], EventEmitter], AsyncGenerator[None, None]]: """ Creates a knowledge base search step with the specified parameters. @@ -15,7 +16,7 @@ def search_knowledge_base( Args: index_name: The name of the vector index to search top_k: Number of top results to retrieve - + search_method: Search method to use - one of: "vector_search", "hybrid_search" Returns: An async generator function compatible with the Orchestrator """ @@ -27,9 +28,14 @@ async def search_knowledge_base( lex_db_connector = LexDBConnector() user_input = context.get("user_input", "") - documents = await lex_db_connector.vector_search( - query=user_input, top_k=top_k, index_name=index_name - ) + if search_method == "hybrid_search": + documents = await lex_db_connector.hybrid_search( + query=user_input, top_k=top_k, index_name=index_name + ) + else: # Default to vector_search + documents = await lex_db_connector.vector_search( + query=user_input, top_k=top_k, index_name=index_name + ) context["retrieved_docs"] = documents yield diff --git a/src/lex_llm/workflows/beta_workflow_v1_large.py b/src/lex_llm/workflows/beta_workflow_v1_large.py index 2782653..e48259f 100644 --- a/src/lex_llm/workflows/beta_workflow_v1_large.py +++ b/src/lex_llm/workflows/beta_workflow_v1_large.py @@ -14,6 +14,7 @@ def get_workflow(request: WorkflowRunRequest) -> Orchestrator: search_knowledge_base( index_name="article_embeddings_e5", top_k=10, + search_method="hybrid_search", ), generate_response_with_sources( llm_provider=OpenRouterProvider(