From 0fd95e9db0bb2a716cfd16ef00a78cc5a2a1d4ea Mon Sep 17 00:00:00 2001 From: zafarhussain87 Date: Tue, 16 Dec 2025 13:47:32 +0100 Subject: [PATCH 01/12] Implement Hybrid Search methods Implements an advanced search method: - Hybrid Search: RRF fusion of semantic and FTS5 keyword search - semantic search works with article_embeddings_e5 index - FTS5 search works with fts_article_embeddings_e5 index Features: - New API endpoint(hybrid_search) is added for the search method - New method in search_knowledge_base --- openapi/lex-db.yaml | 388 +----------------- .../api/connectors/lex_db_connector.py | 111 +++++ src/lex_llm/tools/search_knowledge_base.py | 22 +- .../workflows/beta_workflow_v1_large.py | 1 + 4 files changed, 131 insertions(+), 391 deletions(-) diff --git a/openapi/lex-db.yaml b/openapi/lex-db.yaml index 3acc91c..8ead37a 100644 --- a/openapi/lex-db.yaml +++ b/openapi/lex-db.yaml @@ -1,387 +1 @@ -openapi: 3.1.0 -info: - title: Lex DB API - description: A wrapper around a SQLite database for encyclopedia articles with vector - and full-text search - version: 0.1.0 -paths: - /: - get: - tags: - - Health - summary: Health Check - description: "Health check endpoint.\n\nReturns:\n dict: Health check information." - operationId: health_check__get - responses: - '200': - description: Successful Response - content: - application/json: - schema: - additionalProperties: true - type: object - title: Response Health Check Get - /api/tables: - get: - tags: - - lex-db - summary: Get a list of tables in the database - description: Get a list of tables in the database. - operationId: get_tables - responses: - '200': - description: Successful Response - content: - application/json: - schema: - additionalProperties: - items: - type: string - type: array - type: object - title: Response Get Tables - /api/vector-search/indexes/{index_name}/query: - post: - tags: - - lex-db - summary: Search a vector index for similar content to the query text - description: Search a vector index for similar content to the query text. - operationId: vector_search - parameters: - - name: index_name - in: path - required: true - schema: - type: string - title: Index Name - requestBody: - required: true - content: - application/json: - schema: - $ref: '#/components/schemas/VectorSearchRequest' - responses: - '200': - description: Successful Response - content: - application/json: - schema: - $ref: '#/components/schemas/VectorSearchResults' - '422': - description: Validation Error - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - /api/articles: - get: - tags: - - lex-db - summary: "An endpoint for filtering articles based on metadata such as id, text\ - \ search, etc. Query parameters are used for filtering (e.g. GET /articles?query=Rundet\xE5\ - rn, or GET /articles?ids=1&ids=2&ids=5)" - description: Filter articles based on metadata such as id, text search, etc. - operationId: get_articles - parameters: - - name: query - in: query - required: false - schema: - anyOf: - - type: string - - type: 'null' - description: Text search in articles - title: Query - description: Text search in articles - - name: ids - in: query - required: false - schema: - anyOf: - - type: string - - type: 'null' - description: List of article IDs (comma-separated, JSON list, or repeated) - title: Ids - description: List of article IDs (comma-separated, JSON list, or repeated) - - name: limit - in: query - required: false - schema: - type: integer - maximum: 100 - minimum: 1 - description: Maximum number of results - default: 50 - title: Limit - description: Maximum number of results - responses: - '200': - description: Successful Response - content: - application/json: - schema: - $ref: '#/components/schemas/SearchResults' - '422': - description: Validation Error - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - /api/vector-search/indexes: - get: - tags: - - lex-db - summary: List all vector indexes and their metadata - description: Return a list of all vector indexes and their metadata. - operationId: list_vector_indexes - responses: - '200': - description: Successful Response - content: - application/json: - schema: - items: - additionalProperties: true - type: object - type: array - title: Response List Vector Indexes - /api/vector-search/indexes/{index_name}: - get: - tags: - - lex-db - summary: Get metadata for a specific vector index - description: Return metadata for a specific vector index. - operationId: get_vector_index - parameters: - - name: index_name - in: path - required: true - schema: - type: string - title: Index Name - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: object - additionalProperties: true - title: Response Get Vector Index - '422': - description: Validation Error - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - /api/benchmark/embeddings: - post: - tags: - - lex-db - summary: Benchmark embedding generation performance - description: Benchmark embedding generation with configurable parameters. - operationId: benchmark_embeddings - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/BenchmarkEmbeddingsRequest' - required: true - responses: - '200': - description: Successful Response - content: - application/json: - schema: - $ref: '#/components/schemas/BenchmarkEmbeddingsResponse' - '422': - description: Validation Error - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' -components: - schemas: - BenchmarkEmbeddingsRequest: - properties: - model_choice: - $ref: '#/components/schemas/EmbeddingModel' - default: intfloat/multilingual-e5-large - num_texts: - type: integer - title: Num Texts - default: 50 - text_length: - type: integer - title: Text Length - default: 200 - type: object - title: BenchmarkEmbeddingsRequest - BenchmarkEmbeddingsResponse: - properties: - num_texts: - type: integer - title: Num Texts - avg_text_length: - type: integer - title: Avg Text Length - total_time_seconds: - type: number - title: Total Time Seconds - texts_per_second: - type: number - title: Texts Per Second - ms_per_text: - type: number - title: Ms Per Text - embedding_dimension: - type: integer - title: Embedding Dimension - type: object - required: - - num_texts - - avg_text_length - - total_time_seconds - - texts_per_second - - ms_per_text - - embedding_dimension - title: BenchmarkEmbeddingsResponse - EmbeddingModel: - type: string - enum: - - intfloat/multilingual-e5-large - - text-embedding-ada-002 - - text-embedding-3-small - - text-embedding-3-large - - mock_model - title: EmbeddingModel - description: Supported embedding models. - HTTPValidationError: - properties: - detail: - items: - $ref: '#/components/schemas/ValidationError' - type: array - title: Detail - type: object - title: HTTPValidationError - SearchResult: - properties: - id: - type: integer - title: Id - xhtml_md: - type: string - title: Xhtml Md - rank: - type: number - title: Rank - url: - anyOf: - - type: string - - type: 'null' - title: Url - title: - type: string - title: Title - type: object - required: - - id - - xhtml_md - - rank - - title - title: SearchResult - description: Single result from a search. - SearchResults: - properties: - entries: - items: - $ref: '#/components/schemas/SearchResult' - type: array - title: Entries - total: - type: integer - title: Total - limit: - type: integer - title: Limit - type: object - required: - - entries - - total - - limit - title: SearchResults - description: Results of a search. - ValidationError: - properties: - loc: - items: - anyOf: - - type: string - - type: integer - type: array - title: Location - msg: - type: string - title: Message - type: - type: string - title: Error Type - type: object - required: - - loc - - msg - - type - title: ValidationError - VectorSearchRequest: - properties: - query_text: - type: string - title: Query Text - top_k: - type: integer - title: Top K - default: 5 - type: object - required: - - query_text - title: VectorSearchRequest - description: Vector search request model. - VectorSearchResult: - properties: - id_in_index: - type: integer - title: Id In Index - source_article_id: - type: string - title: Source Article Id - chunk_seq: - type: integer - title: Chunk Seq - chunk_text: - type: string - title: Chunk Text - distance: - type: number - title: Distance - type: object - required: - - id_in_index - - source_article_id - - chunk_seq - - chunk_text - - distance - title: VectorSearchResult - description: Result of a vector search. - VectorSearchResults: - properties: - results: - items: - $ref: '#/components/schemas/VectorSearchResult' - type: array - title: Results - type: object - required: - - results - title: VectorSearchResults - description: Result of a vector search. +{"openapi":"3.1.0","info":{"title":"Lex DB API","description":"A wrapper around a SQLite database for encyclopedia articles with vector and full-text search","version":"0.1.0"},"paths":{"/":{"get":{"tags":["Health"],"summary":"Health Check","description":"Health check endpoint.\n\nReturns:\n dict: Health check information.","operationId":"health_check__get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":true,"type":"object","title":"Response Health Check Get"}}}}}}},"/api/tables":{"get":{"tags":["lex-db"],"summary":"Get a list of tables in the database","description":"Get a list of tables in the database.","operationId":"get_tables","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"items":{"type":"string"},"type":"array"},"type":"object","title":"Response Get Tables"}}}}}}},"/api/vector-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Search a vector index for similar content to the query text","description":"Search a vector index for similar content to the query text.","operationId":"vector_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Hybrid search combining semantic and keyword search with RRF fusion","description":"Perform hybrid search using RRF fusion of semantic and keyword search.","operationId":"hybrid_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"HyDE search using LLM-generated hypothetical document","description":"Perform HyDE search: generate hypothetical document, embed it, and search.","operationId":"hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"type":"object","additionalProperties":true},"title":"Response Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Adaptive hybrid search with HyDE + FTS and query-type based weighting","description":"Perform adaptive hybrid search using HyDE + FTS with adaptive RRF weighting.","operationId":"hybrid_hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridHyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/HybridHyDESearchResults"},"title":"Response Hybrid Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/articles":{"get":{"tags":["lex-db"],"summary":"An endpoint for filtering articles based on metadata such as id, text search, etc. Query parameters are used for filtering (e.g. GET /articles?query=Rundetårn, or GET /articles?ids=1&ids=2&ids=5)","description":"Filter articles based on metadata such as id, text search, etc.","operationId":"get_articles","parameters":[{"name":"query","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"Text search in articles","title":"Query"},"description":"Text search in articles"},{"name":"ids","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"List of article IDs (comma-separated, JSON list, or repeated)","title":"Ids"},"description":"List of article IDs (comma-separated, JSON list, or repeated)"},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","maximum":100,"minimum":1,"description":"Maximum number of results","default":50,"title":"Limit"},"description":"Maximum number of results"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/SearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/vector-search/indexes":{"get":{"tags":["lex-db"],"summary":"List all vector indexes and their metadata","description":"Return a list of all vector indexes and their metadata.","operationId":"list_vector_indexes","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"items":{"additionalProperties":true,"type":"object"},"type":"array","title":"Response List Vector Indexes"}}}}}}},"/api/vector-search/indexes/{index_name}":{"get":{"tags":["lex-db"],"summary":"Get metadata for a specific vector index","description":"Return metadata for a specific vector index.","operationId":"get_vector_index","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"object","additionalProperties":true,"title":"Response Get Vector Index"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"HyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10}},"type":"object","required":["query_text"],"title":"HyDESearchRequest","description":"HyDE search request model."},"HybridHyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_hyde":{"type":"integer","title":"Top K Hyde","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridHyDESearchRequest","description":"Adaptive hybrid search request model."},"HybridHyDESearchResults":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"hyde_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Hyde Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","hyde_rank","fts_rank","source"],"title":"HybridHyDESearchResults"},"HybridSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_semantic":{"type":"integer","title":"Top K Semantic","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridSearchRequest","description":"Hybrid search request model."},"HybridSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/lex_db__hybrid_search__SearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"HybridSearchResults","description":"Results of a hybrid search."},"SearchResults":{"properties":{"entries":{"items":{"$ref":"#/components/schemas/lex_db__database__SearchResult"},"type":"array","title":"Entries"},"total":{"type":"integer","title":"Total"},"limit":{"type":"integer","title":"Limit"}},"type":"object","required":["entries","total","limit"],"title":"SearchResults","description":"Results of a search."},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"VectorSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":5}},"type":"object","required":["query_text"],"title":"VectorSearchRequest","description":"Vector search request model."},"VectorSearchResult":{"properties":{"id_in_index":{"type":"integer","title":"Id In Index"},"source_article_id":{"type":"string","title":"Source Article Id"},"chunk_seq":{"type":"integer","title":"Chunk Seq"},"chunk_text":{"type":"string","title":"Chunk Text"},"distance":{"type":"number","title":"Distance"}},"type":"object","required":["id_in_index","source_article_id","chunk_seq","chunk_text","distance"],"title":"VectorSearchResult","description":"Result of a vector search."},"VectorSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/VectorSearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"VectorSearchResults","description":"Result of a vector search."},"lex_db__database__SearchResult":{"properties":{"id":{"type":"integer","title":"Id"},"xhtml_md":{"type":"string","title":"Xhtml Md"},"rank":{"type":"number","title":"Rank"},"url":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Url"},"title":{"type":"string","title":"Title"}},"type":"object","required":["id","xhtml_md","rank","title"],"title":"SearchResult","description":"Single result from a search."},"lex_db__hybrid_search__SearchResult":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"semantic_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Semantic Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","semantic_rank","fts_rank","source"],"title":"SearchResult"}}}} \ No newline at end of file diff --git a/src/lex_llm/api/connectors/lex_db_connector.py b/src/lex_llm/api/connectors/lex_db_connector.py index 9d43a20..4e2d45b 100644 --- a/src/lex_llm/api/connectors/lex_db_connector.py +++ b/src/lex_llm/api/connectors/lex_db_connector.py @@ -7,6 +7,10 @@ from lex_db_api.api_client import ApiClient from lex_db_api.configuration import Configuration from lex_db_api.models.vector_search_request import VectorSearchRequest +from lex_db_api.models.hybrid_search_request import HybridSearchRequest +from lex_db_api.models.hy_de_search_request import HyDESearchRequest +from lex_db_api.models.hybrid_hy_de_search_request import HybridHyDESearchRequest + lexdb_client = ApiClient( configuration=Configuration(host=os.getenv("DB_HOST", "http://localhost:8000")) @@ -59,3 +63,110 @@ async def vector_search( print(f"Error connecting to LexDB: {e}") # TODO: more robust error handling/logging return [] + + async def hybrid_search( + self, + query: str, + top_k: int = 10, + top_k_semantic: int = 50, + top_k_fts: int = 50, + rrf_k: int = 60, + index_name: str = "article_embeddings_e5", + ) -> List[LexArticle]: + """Performs hybrid search using RRF fusion via the lex-db API.""" + + try: + hybrid_req = HybridSearchRequest( + query_text=query, + top_k=top_k, + top_k_semantic=top_k_semantic, + top_k_fts=top_k_fts, + rrf_k=rrf_k, + ) + + hybrid_search_result = lexdb_api.hybrid_search(index_name, hybrid_req) + + if hybrid_search_result.results: + return [ + LexArticle( + id=result.article_id, + title=result.article_headword, + text=result.chunk_text, + url=f"https://lex.dk/{result.article_headword}", + ) + for result in hybrid_search_result.results + ] + + return [] + except httpx.RequestError as e: + print(f"Error connecting to LexDB: {e}") + return [] + + async def hyde_search( + self, + query: str, + top_k: int = 10, + index_name: str = "article_embeddings_e5", + ) -> List[LexArticle]: + """Performs HyDE search via the lex-db API.""" + + try: + hyde_req = HyDESearchRequest(query_text=query, top_k=top_k) + + hyde_search_result = lexdb_api.hyde_search(index_name, hyde_req) + + if hyde_search_result: + return [ + LexArticle( + id=result["article_id"], + title=result["headword"], + text=result["text"], + url=f"https://lex.dk/{result['headword']}", + ) + for result in hyde_search_result + ] + + return [] + except httpx.RequestError as e: + print(f"Error connecting to LexDB: {e}") + return [] + + async def hybrid_hyde_search( + self, + query: str, + top_k: int = 10, + top_k_hyde: int = 50, + top_k_fts: int = 50, + rrf_k: int = 60, + index_name: str = "article_embeddings_e5", + ) -> List[LexArticle]: + """Performs hybrid HyDE search with adaptive RRF weighting.""" + + try: + hybrid_hyde_req = HybridHyDESearchRequest( + query_text=query, + top_k=top_k, + top_k_hyde=top_k_hyde, + top_k_fts=top_k_fts, + rrf_k=rrf_k, + ) + + hybrid_hyde_result = lexdb_api.hybrid_hyde_search( + index_name, hybrid_hyde_req + ) + + if hybrid_hyde_result: + return [ + LexArticle( + id=result.article_id, + title=result.article_headword, + text=result.chunk_text, + url=f"https://lex.dk/{result.article_headword}", + ) + for result in hybrid_hyde_result + ] + + return [] + except httpx.RequestError as e: + print(f"Error connecting to LexDB: {e}") + return [] diff --git a/src/lex_llm/tools/search_knowledge_base.py b/src/lex_llm/tools/search_knowledge_base.py index c45c5fb..15f8b17 100644 --- a/src/lex_llm/tools/search_knowledge_base.py +++ b/src/lex_llm/tools/search_knowledge_base.py @@ -8,6 +8,7 @@ def search_knowledge_base( index_name: str = "openai_large_3_sections", top_k: int = 10, + search_method: str = "vector_search", ) -> Callable[[Dict[str, Any], EventEmitter], AsyncGenerator[None, None]]: """ Creates a knowledge base search step with the specified parameters. @@ -15,7 +16,7 @@ def search_knowledge_base( Args: index_name: The name of the vector index to search top_k: Number of top results to retrieve - + search_method: Search method to use - one of: "vector_search", "hybrid_search", "hyde_search", "hybrid_hyde_search" Returns: An async generator function compatible with the Orchestrator """ @@ -27,9 +28,22 @@ async def search_knowledge_base( lex_db_connector = LexDBConnector() user_input = context.get("user_input", "") - documents = await lex_db_connector.vector_search( - query=user_input, top_k=top_k, index_name=index_name - ) + if search_method == "hybrid_search": + documents = await lex_db_connector.hybrid_search( + query=user_input, top_k=top_k, index_name=index_name + ) + elif search_method == "hyde_search": + documents = await lex_db_connector.hyde_search( + query=user_input, top_k=top_k, index_name=index_name + ) + elif search_method == "hybrid_hyde_search": + documents = await lex_db_connector.hybrid_hyde_search( + query=user_input, top_k=top_k, index_name=index_name + ) + else: # Default to vector_search + documents = await lex_db_connector.vector_search( + query=user_input, top_k=top_k, index_name=index_name + ) context["retrieved_docs"] = documents yield diff --git a/src/lex_llm/workflows/beta_workflow_v1_large.py b/src/lex_llm/workflows/beta_workflow_v1_large.py index 2782653..da0ca53 100644 --- a/src/lex_llm/workflows/beta_workflow_v1_large.py +++ b/src/lex_llm/workflows/beta_workflow_v1_large.py @@ -14,6 +14,7 @@ def get_workflow(request: WorkflowRunRequest) -> Orchestrator: search_knowledge_base( index_name="article_embeddings_e5", top_k=10, + search_method="hybrid_hyde_search", ), generate_response_with_sources( llm_provider=OpenRouterProvider( From 352d216103b0832f4aa96c5276f75b0e997c5fe9 Mon Sep 17 00:00:00 2001 From: zafarhussain87 Date: Tue, 16 Dec 2025 13:57:29 +0100 Subject: [PATCH 02/12] Update beta workflow to integrate hybrid search --- src/lex_llm/workflows/beta_workflow_v1_large.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lex_llm/workflows/beta_workflow_v1_large.py b/src/lex_llm/workflows/beta_workflow_v1_large.py index da0ca53..2e1e51c 100644 --- a/src/lex_llm/workflows/beta_workflow_v1_large.py +++ b/src/lex_llm/workflows/beta_workflow_v1_large.py @@ -14,7 +14,7 @@ def get_workflow(request: WorkflowRunRequest) -> Orchestrator: search_knowledge_base( index_name="article_embeddings_e5", top_k=10, - search_method="hybrid_hyde_search", + search_method = "hybrid_search" ), generate_response_with_sources( llm_provider=OpenRouterProvider( @@ -67,4 +67,4 @@ def get_metadata() -> dict: "openrouter", "gemma", ], - } + } \ No newline at end of file From b4267413bbc4f17216688d3e4a8bdd8a069aa5ef Mon Sep 17 00:00:00 2001 From: zafarhussain87 Date: Tue, 16 Dec 2025 14:15:06 +0100 Subject: [PATCH 03/12] yaml file is updated --- openapi/lex-db.yaml | 583 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 582 insertions(+), 1 deletion(-) diff --git a/openapi/lex-db.yaml b/openapi/lex-db.yaml index 8ead37a..279688b 100644 --- a/openapi/lex-db.yaml +++ b/openapi/lex-db.yaml @@ -1 +1,582 @@ -{"openapi":"3.1.0","info":{"title":"Lex DB API","description":"A wrapper around a SQLite database for encyclopedia articles with vector and full-text search","version":"0.1.0"},"paths":{"/":{"get":{"tags":["Health"],"summary":"Health Check","description":"Health check endpoint.\n\nReturns:\n dict: Health check information.","operationId":"health_check__get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":true,"type":"object","title":"Response Health Check Get"}}}}}}},"/api/tables":{"get":{"tags":["lex-db"],"summary":"Get a list of tables in the database","description":"Get a list of tables in the database.","operationId":"get_tables","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"items":{"type":"string"},"type":"array"},"type":"object","title":"Response Get Tables"}}}}}}},"/api/vector-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Search a vector index for similar content to the query text","description":"Search a vector index for similar content to the query text.","operationId":"vector_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Hybrid search combining semantic and keyword search with RRF fusion","description":"Perform hybrid search using RRF fusion of semantic and keyword search.","operationId":"hybrid_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"HyDE search using LLM-generated hypothetical document","description":"Perform HyDE search: generate hypothetical document, embed it, and search.","operationId":"hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"type":"object","additionalProperties":true},"title":"Response Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Adaptive hybrid search with HyDE + FTS and query-type based weighting","description":"Perform adaptive hybrid search using HyDE + FTS with adaptive RRF weighting.","operationId":"hybrid_hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridHyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/HybridHyDESearchResults"},"title":"Response Hybrid Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/articles":{"get":{"tags":["lex-db"],"summary":"An endpoint for filtering articles based on metadata such as id, text search, etc. Query parameters are used for filtering (e.g. GET /articles?query=Rundetårn, or GET /articles?ids=1&ids=2&ids=5)","description":"Filter articles based on metadata such as id, text search, etc.","operationId":"get_articles","parameters":[{"name":"query","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"Text search in articles","title":"Query"},"description":"Text search in articles"},{"name":"ids","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"List of article IDs (comma-separated, JSON list, or repeated)","title":"Ids"},"description":"List of article IDs (comma-separated, JSON list, or repeated)"},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","maximum":100,"minimum":1,"description":"Maximum number of results","default":50,"title":"Limit"},"description":"Maximum number of results"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/SearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/vector-search/indexes":{"get":{"tags":["lex-db"],"summary":"List all vector indexes and their metadata","description":"Return a list of all vector indexes and their metadata.","operationId":"list_vector_indexes","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"items":{"additionalProperties":true,"type":"object"},"type":"array","title":"Response List Vector Indexes"}}}}}}},"/api/vector-search/indexes/{index_name}":{"get":{"tags":["lex-db"],"summary":"Get metadata for a specific vector index","description":"Return metadata for a specific vector index.","operationId":"get_vector_index","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"object","additionalProperties":true,"title":"Response Get Vector Index"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"HyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10}},"type":"object","required":["query_text"],"title":"HyDESearchRequest","description":"HyDE search request model."},"HybridHyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_hyde":{"type":"integer","title":"Top K Hyde","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridHyDESearchRequest","description":"Adaptive hybrid search request model."},"HybridHyDESearchResults":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"hyde_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Hyde Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","hyde_rank","fts_rank","source"],"title":"HybridHyDESearchResults"},"HybridSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_semantic":{"type":"integer","title":"Top K Semantic","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridSearchRequest","description":"Hybrid search request model."},"HybridSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/lex_db__hybrid_search__SearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"HybridSearchResults","description":"Results of a hybrid search."},"SearchResults":{"properties":{"entries":{"items":{"$ref":"#/components/schemas/lex_db__database__SearchResult"},"type":"array","title":"Entries"},"total":{"type":"integer","title":"Total"},"limit":{"type":"integer","title":"Limit"}},"type":"object","required":["entries","total","limit"],"title":"SearchResults","description":"Results of a search."},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"VectorSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":5}},"type":"object","required":["query_text"],"title":"VectorSearchRequest","description":"Vector search request model."},"VectorSearchResult":{"properties":{"id_in_index":{"type":"integer","title":"Id In Index"},"source_article_id":{"type":"string","title":"Source Article Id"},"chunk_seq":{"type":"integer","title":"Chunk Seq"},"chunk_text":{"type":"string","title":"Chunk Text"},"distance":{"type":"number","title":"Distance"}},"type":"object","required":["id_in_index","source_article_id","chunk_seq","chunk_text","distance"],"title":"VectorSearchResult","description":"Result of a vector search."},"VectorSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/VectorSearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"VectorSearchResults","description":"Result of a vector search."},"lex_db__database__SearchResult":{"properties":{"id":{"type":"integer","title":"Id"},"xhtml_md":{"type":"string","title":"Xhtml Md"},"rank":{"type":"number","title":"Rank"},"url":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Url"},"title":{"type":"string","title":"Title"}},"type":"object","required":["id","xhtml_md","rank","title"],"title":"SearchResult","description":"Single result from a search."},"lex_db__hybrid_search__SearchResult":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"semantic_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Semantic Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","semantic_rank","fts_rank","source"],"title":"SearchResult"}}}} \ No newline at end of file +openapi: 3.1.0 +info: + title: Lex DB API + description: A wrapper around a SQLite database for encyclopedia articles with vector and full-text search + version: 0.1.0 +paths: + /: + get: + tags: + - Health + summary: Health Check + description: | + Health check endpoint. + + Returns: + dict: Health check information. + operationId: health_check__get + responses: + '200': + description: Successful Response + content: + application/json: + schema: + additionalProperties: true + type: object + title: Response Health Check Get + /api/tables: + get: + tags: + - lex-db + summary: Get a list of tables in the database + description: Get a list of tables in the database. + operationId: get_tables + responses: + '200': + description: Successful Response + content: + application/json: + schema: + additionalProperties: + items: + type: string + type: array + type: object + title: Response Get Tables + /api/vector-search/indexes/{index_name}/query: + post: + tags: + - lex-db + summary: Search a vector index for similar content to the query text + description: Search a vector index for similar content to the query text. + operationId: vector_search + parameters: + - name: index_name + in: path + required: true + schema: + type: string + title: Index Name + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/VectorSearchRequest' + responses: + '200': + description: Successful Response + content: + application/json: + schema: + $ref: '#/components/schemas/VectorSearchResults' + '422': + description: Validation Error + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + /api/hybrid-search/indexes/{index_name}/query: + post: + tags: + - lex-db + summary: Hybrid search combining semantic and keyword search with RRF fusion + description: Perform hybrid search using RRF fusion of semantic and keyword search. + operationId: hybrid_search + parameters: + - name: index_name + in: path + required: true + schema: + type: string + title: Index Name + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/HybridSearchRequest' + responses: + '200': + description: Successful Response + content: + application/json: + schema: + $ref: '#/components/schemas/HybridSearchResults' + '422': + description: Validation Error + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + /api/hyde-search/indexes/{index_name}/query: + post: + tags: + - lex-db + summary: HyDE search using LLM-generated hypothetical document + description: Perform HyDE search; generate hypothetical document, embed it, and search. + operationId: hyde_search + parameters: + - name: index_name + in: path + required: true + schema: + type: string + title: Index Name + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/HyDESearchRequest' + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: array + items: + type: object + additionalProperties: true + title: Response Hyde Search + '422': + description: Validation Error + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + /api/hybrid-hyde-search/indexes/{index_name}/query: + post: + tags: + - lex-db + summary: Adaptive hybrid search with HyDE + FTS and query-type based weighting + description: Perform adaptive hybrid search using HyDE + FTS with adaptive RRF weighting. + operationId: hybrid_hyde_search + parameters: + - name: index_name + in: path + required: true + schema: + type: string + title: Index Name + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/HybridHyDESearchRequest' + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/HybridHyDESearchResults' + title: Response Hybrid Hyde Search + '422': + description: Validation Error + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + /api/articles: + get: + tags: + - lex-db + summary: An endpoint for filtering articles based on metadata + description: Filter articles based on metadata such as id, text search, etc. + operationId: get_articles + parameters: + - name: query + in: query + required: false + schema: + anyOf: + - type: string + - type: 'null' + description: Text search in articles + title: Query + description: Text search in articles + - name: ids + in: query + required: false + schema: + anyOf: + - type: string + - type: 'null' + description: List of article IDs (comma-separated, JSON list, or repeated) + title: Ids + description: List of article IDs (comma-separated, JSON list, or repeated) + - name: limit + in: query + required: false + schema: + type: integer + maximum: 100 + minimum: 1 + description: Maximum number of results + default: 50 + title: Limit + description: Maximum number of results + responses: + '200': + description: Successful Response + content: + application/json: + schema: + $ref: '#/components/schemas/SearchResults' + '422': + description: Validation Error + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + /api/vector-search/indexes: + get: + tags: + - lex-db + summary: List all vector indexes and their metadata + description: Return a list of all vector indexes and their metadata. + operationId: list_vector_indexes + responses: + '200': + description: Successful Response + content: + application/json: + schema: + items: + additionalProperties: true + type: object + type: array + title: Response List Vector Indexes + /api/vector-search/indexes/{index_name}: + get: + tags: + - lex-db + summary: Get metadata for a specific vector index + description: Return metadata for a specific vector index. + operationId: get_vector_index + parameters: + - name: index_name + in: path + required: true + schema: + type: string + title: Index Name + responses: + '200': + description: Successful Response + content: + application/json: + schema: + type: object + additionalProperties: true + title: Response Get Vector Index + '422': + description: Validation Error + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' +components: + schemas: + HTTPValidationError: + properties: + detail: + items: + $ref: '#/components/schemas/ValidationError' + type: array + title: Detail + type: object + title: HTTPValidationError + HyDESearchRequest: + properties: + query_text: + type: string + title: Query Text + top_k: + type: integer + title: Top K + default: 10 + type: object + required: + - query_text + title: HyDESearchRequest + description: HyDE search request model. + HybridHyDESearchRequest: + properties: + query_text: + type: string + title: Query Text + top_k: + type: integer + title: Top K + default: 10 + top_k_hyde: + type: integer + title: Top K Hyde + default: 50 + top_k_fts: + type: integer + title: Top K Fts + default: 50 + rrf_k: + type: integer + title: Rrf K + default: 60 + type: object + required: + - query_text + title: HybridHyDESearchRequest + description: Adaptive hybrid search request model. + HybridHyDESearchResults: + properties: + rank: + type: integer + title: Rank + article_id: + type: integer + title: Article Id + article_headword: + type: string + title: Article Headword + chunk_sequence: + type: integer + title: Chunk Sequence + chunk_text: + type: string + title: Chunk Text + rrf_score: + type: number + title: Rrf Score + hyde_rank: + anyOf: + - type: integer + - type: 'null' + title: Hyde Rank + fts_rank: + anyOf: + - type: integer + - type: 'null' + title: Fts Rank + source: + type: string + title: Source + type: object + required: + - rank + - article_id + - article_headword + - chunk_sequence + - chunk_text + - rrf_score + - hyde_rank + - fts_rank + - source + title: HybridHyDESearchResults + HybridSearchRequest: + properties: + query_text: + type: string + title: Query Text + top_k: + type: integer + title: Top K + default: 10 + top_k_semantic: + type: integer + title: Top K Semantic + default: 50 + top_k_fts: + type: integer + title: Top K Fts + default: 50 + rrf_k: + type: integer + title: Rrf K + default: 60 + type: object + required: + - query_text + title: HybridSearchRequest + description: Hybrid search request model. + HybridSearchResults: + properties: + results: + items: + $ref: '#/components/schemas/lex_db__hybrid_search__SearchResult' + type: array + title: Results + type: object + required: + - results + title: HybridSearchResults + description: Results of a hybrid search. + SearchResults: + properties: + entries: + items: + $ref: '#/components/schemas/lex_db__database__SearchResult' + type: array + title: Entries + total: + type: integer + title: Total + limit: + type: integer + title: Limit + type: object + required: + - entries + - total + - limit + title: SearchResults + description: Results of a search. + ValidationError: + properties: + loc: + items: + anyOf: + - type: string + - type: integer + type: array + title: Location + msg: + type: string + title: Message + type: + type: string + title: Error Type + type: object + required: + - loc + - msg + - type + title: ValidationError + VectorSearchRequest: + properties: + query_text: + type: string + title: Query Text + top_k: + type: integer + title: Top K + default: 5 + type: object + required: + - query_text + title: VectorSearchRequest + description: Vector search request model. + VectorSearchResult: + properties: + id_in_index: + type: integer + title: Id In Index + source_article_id: + type: string + title: Source Article Id + chunk_seq: + type: integer + title: Chunk Seq + chunk_text: + type: string + title: Chunk Text + distance: + type: number + title: Distance + type: object + required: + - id_in_index + - source_article_id + - chunk_seq + - chunk_text + - distance + title: VectorSearchResult + description: Result of a vector search. + VectorSearchResults: + properties: + results: + items: + $ref: '#/components/schemas/VectorSearchResult' + type: array + title: Results + type: object + required: + - results + title: VectorSearchResults + description: Result of a vector search. + lex_db__database__SearchResult: + properties: + id: + type: integer + title: Id + xhtml_md: + type: string + title: Xhtml Md + rank: + type: number + title: Rank + url: + anyOf: + - type: string + - type: 'null' + title: Url + title: + type: string + title: Title + type: object + required: + - id + - xhtml_md + - rank + - title + title: SearchResult + description: Single result from a search. + lex_db__hybrid_search__SearchResult: + properties: + rank: + type: integer + title: Rank + article_id: + type: integer + title: Article Id + article_headword: + type: string + title: Article Headword + chunk_sequence: + type: integer + title: Chunk Sequence + chunk_text: + type: string + title: Chunk Text + rrf_score: + type: number + title: Rrf Score + semantic_rank: + anyOf: + - type: integer + - type: 'null' + title: Semantic Rank + fts_rank: + anyOf: + - type: integer + - type: 'null' + title: Fts Rank + source: + type: string + title: Source + type: object + required: + - rank + - article_id + - article_headword + - chunk_sequence + - chunk_text + - rrf_score + - semantic_rank + - fts_rank + - source + title: SearchResult \ No newline at end of file From cd243adb2d039ed38642abf887bfbf95ea04f3da Mon Sep 17 00:00:00 2001 From: zafarhussain87 Date: Tue, 16 Dec 2025 14:24:33 +0100 Subject: [PATCH 04/12] Update beta workflow and integrate hybrid search --- src/lex_llm/workflows/beta_workflow_v1_large.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lex_llm/workflows/beta_workflow_v1_large.py b/src/lex_llm/workflows/beta_workflow_v1_large.py index 2e1e51c..e48259f 100644 --- a/src/lex_llm/workflows/beta_workflow_v1_large.py +++ b/src/lex_llm/workflows/beta_workflow_v1_large.py @@ -14,7 +14,7 @@ def get_workflow(request: WorkflowRunRequest) -> Orchestrator: search_knowledge_base( index_name="article_embeddings_e5", top_k=10, - search_method = "hybrid_search" + search_method="hybrid_search", ), generate_response_with_sources( llm_provider=OpenRouterProvider( @@ -67,4 +67,4 @@ def get_metadata() -> dict: "openrouter", "gemma", ], - } \ No newline at end of file + } From f15b30a05a234cce45ee5176a607d3b751cd2884 Mon Sep 17 00:00:00 2001 From: zafarhussain87 Date: Tue, 16 Dec 2025 14:40:42 +0100 Subject: [PATCH 05/12] Apply local workflow and YAML changes before cherry-pick --- openapi/lex-db.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openapi/lex-db.yaml b/openapi/lex-db.yaml index 279688b..4d7c3e8 100644 --- a/openapi/lex-db.yaml +++ b/openapi/lex-db.yaml @@ -579,4 +579,5 @@ components: - semantic_rank - fts_rank - source - title: SearchResult \ No newline at end of file + title: SearchResult +{"openapi":"3.1.0","info":{"title":"Lex DB API","description":"A wrapper around a SQLite database for encyclopedia articles with vector and full-text search","version":"0.1.0"},"paths":{"/":{"get":{"tags":["Health"],"summary":"Health Check","description":"Health check endpoint.\n\nReturns:\n dict: Health check information.","operationId":"health_check__get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":true,"type":"object","title":"Response Health Check Get"}}}}}}},"/api/tables":{"get":{"tags":["lex-db"],"summary":"Get a list of tables in the database","description":"Get a list of tables in the database.","operationId":"get_tables","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"items":{"type":"string"},"type":"array"},"type":"object","title":"Response Get Tables"}}}}}}},"/api/vector-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Search a vector index for similar content to the query text","description":"Search a vector index for similar content to the query text.","operationId":"vector_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Hybrid search combining semantic and keyword search with RRF fusion","description":"Perform hybrid search using RRF fusion of semantic and keyword search.","operationId":"hybrid_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"HyDE search using LLM-generated hypothetical document","description":"Perform HyDE search: generate hypothetical document, embed it, and search.","operationId":"hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"type":"object","additionalProperties":true},"title":"Response Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Adaptive hybrid search with HyDE + FTS and query-type based weighting","description":"Perform adaptive hybrid search using HyDE + FTS with adaptive RRF weighting.","operationId":"hybrid_hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridHyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/HybridHyDESearchResults"},"title":"Response Hybrid Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/articles":{"get":{"tags":["lex-db"],"summary":"An endpoint for filtering articles based on metadata such as id, text search, etc. Query parameters are used for filtering (e.g. GET /articles?query=Rundetårn, or GET /articles?ids=1&ids=2&ids=5)","description":"Filter articles based on metadata such as id, text search, etc.","operationId":"get_articles","parameters":[{"name":"query","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"Text search in articles","title":"Query"},"description":"Text search in articles"},{"name":"ids","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"List of article IDs (comma-separated, JSON list, or repeated)","title":"Ids"},"description":"List of article IDs (comma-separated, JSON list, or repeated)"},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","maximum":100,"minimum":1,"description":"Maximum number of results","default":50,"title":"Limit"},"description":"Maximum number of results"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/SearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/vector-search/indexes":{"get":{"tags":["lex-db"],"summary":"List all vector indexes and their metadata","description":"Return a list of all vector indexes and their metadata.","operationId":"list_vector_indexes","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"items":{"additionalProperties":true,"type":"object"},"type":"array","title":"Response List Vector Indexes"}}}}}}},"/api/vector-search/indexes/{index_name}":{"get":{"tags":["lex-db"],"summary":"Get metadata for a specific vector index","description":"Return metadata for a specific vector index.","operationId":"get_vector_index","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"object","additionalProperties":true,"title":"Response Get Vector Index"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"HyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10}},"type":"object","required":["query_text"],"title":"HyDESearchRequest","description":"HyDE search request model."},"HybridHyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_hyde":{"type":"integer","title":"Top K Hyde","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridHyDESearchRequest","description":"Adaptive hybrid search request model."},"HybridHyDESearchResults":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"hyde_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Hyde Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","hyde_rank","fts_rank","source"],"title":"HybridHyDESearchResults"},"HybridSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_semantic":{"type":"integer","title":"Top K Semantic","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridSearchRequest","description":"Hybrid search request model."},"HybridSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/lex_db__hybrid_search__SearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"HybridSearchResults","description":"Results of a hybrid search."},"SearchResults":{"properties":{"entries":{"items":{"$ref":"#/components/schemas/lex_db__database__SearchResult"},"type":"array","title":"Entries"},"total":{"type":"integer","title":"Total"},"limit":{"type":"integer","title":"Limit"}},"type":"object","required":["entries","total","limit"],"title":"SearchResults","description":"Results of a search."},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"VectorSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":5}},"type":"object","required":["query_text"],"title":"VectorSearchRequest","description":"Vector search request model."},"VectorSearchResult":{"properties":{"id_in_index":{"type":"integer","title":"Id In Index"},"source_article_id":{"type":"string","title":"Source Article Id"},"chunk_seq":{"type":"integer","title":"Chunk Seq"},"chunk_text":{"type":"string","title":"Chunk Text"},"distance":{"type":"number","title":"Distance"}},"type":"object","required":["id_in_index","source_article_id","chunk_seq","chunk_text","distance"],"title":"VectorSearchResult","description":"Result of a vector search."},"VectorSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/VectorSearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"VectorSearchResults","description":"Result of a vector search."},"lex_db__database__SearchResult":{"properties":{"id":{"type":"integer","title":"Id"},"xhtml_md":{"type":"string","title":"Xhtml Md"},"rank":{"type":"number","title":"Rank"},"url":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Url"},"title":{"type":"string","title":"Title"}},"type":"object","required":["id","xhtml_md","rank","title"],"title":"SearchResult","description":"Single result from a search."},"lex_db__hybrid_search__SearchResult":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"semantic_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Semantic Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","semantic_rank","fts_rank","source"],"title":"SearchResult"}}}} From 0368392361d84bb77cc33570e81822870c5968f4 Mon Sep 17 00:00:00 2001 From: zafarhussain87 Date: Tue, 16 Dec 2025 13:47:32 +0100 Subject: [PATCH 06/12] Implement Hybrid Search methods Implements an advanced search method: - Hybrid Search: RRF fusion of semantic and FTS5 keyword search - semantic search works with article_embeddings_e5 index - FTS5 search works with fts_article_embeddings_e5 index Features: - New API endpoint(hybrid_search) is added for the search method - New method in search_knowledge_base --- openapi/lex-db.yaml | 1 + src/lex_llm/workflows/beta_workflow_v1_large.py | 1 + 2 files changed, 2 insertions(+) diff --git a/openapi/lex-db.yaml b/openapi/lex-db.yaml index 4d7c3e8..7611f27 100644 --- a/openapi/lex-db.yaml +++ b/openapi/lex-db.yaml @@ -581,3 +581,4 @@ components: - source title: SearchResult {"openapi":"3.1.0","info":{"title":"Lex DB API","description":"A wrapper around a SQLite database for encyclopedia articles with vector and full-text search","version":"0.1.0"},"paths":{"/":{"get":{"tags":["Health"],"summary":"Health Check","description":"Health check endpoint.\n\nReturns:\n dict: Health check information.","operationId":"health_check__get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":true,"type":"object","title":"Response Health Check Get"}}}}}}},"/api/tables":{"get":{"tags":["lex-db"],"summary":"Get a list of tables in the database","description":"Get a list of tables in the database.","operationId":"get_tables","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"items":{"type":"string"},"type":"array"},"type":"object","title":"Response Get Tables"}}}}}}},"/api/vector-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Search a vector index for similar content to the query text","description":"Search a vector index for similar content to the query text.","operationId":"vector_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Hybrid search combining semantic and keyword search with RRF fusion","description":"Perform hybrid search using RRF fusion of semantic and keyword search.","operationId":"hybrid_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"HyDE search using LLM-generated hypothetical document","description":"Perform HyDE search: generate hypothetical document, embed it, and search.","operationId":"hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"type":"object","additionalProperties":true},"title":"Response Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Adaptive hybrid search with HyDE + FTS and query-type based weighting","description":"Perform adaptive hybrid search using HyDE + FTS with adaptive RRF weighting.","operationId":"hybrid_hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridHyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/HybridHyDESearchResults"},"title":"Response Hybrid Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/articles":{"get":{"tags":["lex-db"],"summary":"An endpoint for filtering articles based on metadata such as id, text search, etc. Query parameters are used for filtering (e.g. GET /articles?query=Rundetårn, or GET /articles?ids=1&ids=2&ids=5)","description":"Filter articles based on metadata such as id, text search, etc.","operationId":"get_articles","parameters":[{"name":"query","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"Text search in articles","title":"Query"},"description":"Text search in articles"},{"name":"ids","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"List of article IDs (comma-separated, JSON list, or repeated)","title":"Ids"},"description":"List of article IDs (comma-separated, JSON list, or repeated)"},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","maximum":100,"minimum":1,"description":"Maximum number of results","default":50,"title":"Limit"},"description":"Maximum number of results"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/SearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/vector-search/indexes":{"get":{"tags":["lex-db"],"summary":"List all vector indexes and their metadata","description":"Return a list of all vector indexes and their metadata.","operationId":"list_vector_indexes","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"items":{"additionalProperties":true,"type":"object"},"type":"array","title":"Response List Vector Indexes"}}}}}}},"/api/vector-search/indexes/{index_name}":{"get":{"tags":["lex-db"],"summary":"Get metadata for a specific vector index","description":"Return metadata for a specific vector index.","operationId":"get_vector_index","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"object","additionalProperties":true,"title":"Response Get Vector Index"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"HyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10}},"type":"object","required":["query_text"],"title":"HyDESearchRequest","description":"HyDE search request model."},"HybridHyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_hyde":{"type":"integer","title":"Top K Hyde","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridHyDESearchRequest","description":"Adaptive hybrid search request model."},"HybridHyDESearchResults":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"hyde_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Hyde Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","hyde_rank","fts_rank","source"],"title":"HybridHyDESearchResults"},"HybridSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_semantic":{"type":"integer","title":"Top K Semantic","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridSearchRequest","description":"Hybrid search request model."},"HybridSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/lex_db__hybrid_search__SearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"HybridSearchResults","description":"Results of a hybrid search."},"SearchResults":{"properties":{"entries":{"items":{"$ref":"#/components/schemas/lex_db__database__SearchResult"},"type":"array","title":"Entries"},"total":{"type":"integer","title":"Total"},"limit":{"type":"integer","title":"Limit"}},"type":"object","required":["entries","total","limit"],"title":"SearchResults","description":"Results of a search."},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"VectorSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":5}},"type":"object","required":["query_text"],"title":"VectorSearchRequest","description":"Vector search request model."},"VectorSearchResult":{"properties":{"id_in_index":{"type":"integer","title":"Id In Index"},"source_article_id":{"type":"string","title":"Source Article Id"},"chunk_seq":{"type":"integer","title":"Chunk Seq"},"chunk_text":{"type":"string","title":"Chunk Text"},"distance":{"type":"number","title":"Distance"}},"type":"object","required":["id_in_index","source_article_id","chunk_seq","chunk_text","distance"],"title":"VectorSearchResult","description":"Result of a vector search."},"VectorSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/VectorSearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"VectorSearchResults","description":"Result of a vector search."},"lex_db__database__SearchResult":{"properties":{"id":{"type":"integer","title":"Id"},"xhtml_md":{"type":"string","title":"Xhtml Md"},"rank":{"type":"number","title":"Rank"},"url":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Url"},"title":{"type":"string","title":"Title"}},"type":"object","required":["id","xhtml_md","rank","title"],"title":"SearchResult","description":"Single result from a search."},"lex_db__hybrid_search__SearchResult":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"semantic_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Semantic Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","semantic_rank","fts_rank","source"],"title":"SearchResult"}}}} +{"openapi":"3.1.0","info":{"title":"Lex DB API","description":"A wrapper around a SQLite database for encyclopedia articles with vector and full-text search","version":"0.1.0"},"paths":{"/":{"get":{"tags":["Health"],"summary":"Health Check","description":"Health check endpoint.\n\nReturns:\n dict: Health check information.","operationId":"health_check__get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":true,"type":"object","title":"Response Health Check Get"}}}}}}},"/api/tables":{"get":{"tags":["lex-db"],"summary":"Get a list of tables in the database","description":"Get a list of tables in the database.","operationId":"get_tables","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"items":{"type":"string"},"type":"array"},"type":"object","title":"Response Get Tables"}}}}}}},"/api/vector-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Search a vector index for similar content to the query text","description":"Search a vector index for similar content to the query text.","operationId":"vector_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Hybrid search combining semantic and keyword search with RRF fusion","description":"Perform hybrid search using RRF fusion of semantic and keyword search.","operationId":"hybrid_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"HyDE search using LLM-generated hypothetical document","description":"Perform HyDE search: generate hypothetical document, embed it, and search.","operationId":"hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"type":"object","additionalProperties":true},"title":"Response Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Adaptive hybrid search with HyDE + FTS and query-type based weighting","description":"Perform adaptive hybrid search using HyDE + FTS with adaptive RRF weighting.","operationId":"hybrid_hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridHyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/HybridHyDESearchResults"},"title":"Response Hybrid Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/articles":{"get":{"tags":["lex-db"],"summary":"An endpoint for filtering articles based on metadata such as id, text search, etc. Query parameters are used for filtering (e.g. GET /articles?query=Rundetårn, or GET /articles?ids=1&ids=2&ids=5)","description":"Filter articles based on metadata such as id, text search, etc.","operationId":"get_articles","parameters":[{"name":"query","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"Text search in articles","title":"Query"},"description":"Text search in articles"},{"name":"ids","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"List of article IDs (comma-separated, JSON list, or repeated)","title":"Ids"},"description":"List of article IDs (comma-separated, JSON list, or repeated)"},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","maximum":100,"minimum":1,"description":"Maximum number of results","default":50,"title":"Limit"},"description":"Maximum number of results"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/SearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/vector-search/indexes":{"get":{"tags":["lex-db"],"summary":"List all vector indexes and their metadata","description":"Return a list of all vector indexes and their metadata.","operationId":"list_vector_indexes","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"items":{"additionalProperties":true,"type":"object"},"type":"array","title":"Response List Vector Indexes"}}}}}}},"/api/vector-search/indexes/{index_name}":{"get":{"tags":["lex-db"],"summary":"Get metadata for a specific vector index","description":"Return metadata for a specific vector index.","operationId":"get_vector_index","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"object","additionalProperties":true,"title":"Response Get Vector Index"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"HyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10}},"type":"object","required":["query_text"],"title":"HyDESearchRequest","description":"HyDE search request model."},"HybridHyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_hyde":{"type":"integer","title":"Top K Hyde","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridHyDESearchRequest","description":"Adaptive hybrid search request model."},"HybridHyDESearchResults":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"hyde_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Hyde Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","hyde_rank","fts_rank","source"],"title":"HybridHyDESearchResults"},"HybridSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_semantic":{"type":"integer","title":"Top K Semantic","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridSearchRequest","description":"Hybrid search request model."},"HybridSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/lex_db__hybrid_search__SearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"HybridSearchResults","description":"Results of a hybrid search."},"SearchResults":{"properties":{"entries":{"items":{"$ref":"#/components/schemas/lex_db__database__SearchResult"},"type":"array","title":"Entries"},"total":{"type":"integer","title":"Total"},"limit":{"type":"integer","title":"Limit"}},"type":"object","required":["entries","total","limit"],"title":"SearchResults","description":"Results of a search."},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"VectorSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":5}},"type":"object","required":["query_text"],"title":"VectorSearchRequest","description":"Vector search request model."},"VectorSearchResult":{"properties":{"id_in_index":{"type":"integer","title":"Id In Index"},"source_article_id":{"type":"string","title":"Source Article Id"},"chunk_seq":{"type":"integer","title":"Chunk Seq"},"chunk_text":{"type":"string","title":"Chunk Text"},"distance":{"type":"number","title":"Distance"}},"type":"object","required":["id_in_index","source_article_id","chunk_seq","chunk_text","distance"],"title":"VectorSearchResult","description":"Result of a vector search."},"VectorSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/VectorSearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"VectorSearchResults","description":"Result of a vector search."},"lex_db__database__SearchResult":{"properties":{"id":{"type":"integer","title":"Id"},"xhtml_md":{"type":"string","title":"Xhtml Md"},"rank":{"type":"number","title":"Rank"},"url":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Url"},"title":{"type":"string","title":"Title"}},"type":"object","required":["id","xhtml_md","rank","title"],"title":"SearchResult","description":"Single result from a search."},"lex_db__hybrid_search__SearchResult":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"semantic_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Semantic Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","semantic_rank","fts_rank","source"],"title":"SearchResult"}}}} diff --git a/src/lex_llm/workflows/beta_workflow_v1_large.py b/src/lex_llm/workflows/beta_workflow_v1_large.py index e48259f..7677b3b 100644 --- a/src/lex_llm/workflows/beta_workflow_v1_large.py +++ b/src/lex_llm/workflows/beta_workflow_v1_large.py @@ -15,6 +15,7 @@ def get_workflow(request: WorkflowRunRequest) -> Orchestrator: index_name="article_embeddings_e5", top_k=10, search_method="hybrid_search", + search_method="hybrid_hyde_search", ), generate_response_with_sources( llm_provider=OpenRouterProvider( From 56ab043ffc0a4c5e955cfd55467717c45263faa4 Mon Sep 17 00:00:00 2001 From: Zafar Date: Tue, 16 Dec 2025 14:48:32 +0100 Subject: [PATCH 07/12] Fix search method in beta_workflow_v1_large.py Corrected the search method from 'hybrid_hyde_search' to 'hybrid_search'. --- src/lex_llm/workflows/beta_workflow_v1_large.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lex_llm/workflows/beta_workflow_v1_large.py b/src/lex_llm/workflows/beta_workflow_v1_large.py index 7677b3b..e48259f 100644 --- a/src/lex_llm/workflows/beta_workflow_v1_large.py +++ b/src/lex_llm/workflows/beta_workflow_v1_large.py @@ -15,7 +15,6 @@ def get_workflow(request: WorkflowRunRequest) -> Orchestrator: index_name="article_embeddings_e5", top_k=10, search_method="hybrid_search", - search_method="hybrid_hyde_search", ), generate_response_with_sources( llm_provider=OpenRouterProvider( From c9c567cb7d495db876c923effb27cc1942226d48 Mon Sep 17 00:00:00 2001 From: zafarhussain87 Date: Tue, 16 Dec 2025 13:47:32 +0100 Subject: [PATCH 08/12] Implement Hybrid Search methods Implements an advanced search method: - Hybrid Search: RRF fusion of semantic and FTS5 keyword search - semantic search works with article_embeddings_e5 index - FTS5 search works with fts_article_embeddings_e5 index Features: - New API endpoint(hybrid_search) is added for the search method - New method in search_knowledge_base --- openapi/lex-db.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/openapi/lex-db.yaml b/openapi/lex-db.yaml index 7611f27..279688b 100644 --- a/openapi/lex-db.yaml +++ b/openapi/lex-db.yaml @@ -579,6 +579,4 @@ components: - semantic_rank - fts_rank - source - title: SearchResult -{"openapi":"3.1.0","info":{"title":"Lex DB API","description":"A wrapper around a SQLite database for encyclopedia articles with vector and full-text search","version":"0.1.0"},"paths":{"/":{"get":{"tags":["Health"],"summary":"Health Check","description":"Health check endpoint.\n\nReturns:\n dict: Health check information.","operationId":"health_check__get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":true,"type":"object","title":"Response Health Check Get"}}}}}}},"/api/tables":{"get":{"tags":["lex-db"],"summary":"Get a list of tables in the database","description":"Get a list of tables in the database.","operationId":"get_tables","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"items":{"type":"string"},"type":"array"},"type":"object","title":"Response Get Tables"}}}}}}},"/api/vector-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Search a vector index for similar content to the query text","description":"Search a vector index for similar content to the query text.","operationId":"vector_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Hybrid search combining semantic and keyword search with RRF fusion","description":"Perform hybrid search using RRF fusion of semantic and keyword search.","operationId":"hybrid_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"HyDE search using LLM-generated hypothetical document","description":"Perform HyDE search: generate hypothetical document, embed it, and search.","operationId":"hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"type":"object","additionalProperties":true},"title":"Response Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Adaptive hybrid search with HyDE + FTS and query-type based weighting","description":"Perform adaptive hybrid search using HyDE + FTS with adaptive RRF weighting.","operationId":"hybrid_hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridHyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/HybridHyDESearchResults"},"title":"Response Hybrid Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/articles":{"get":{"tags":["lex-db"],"summary":"An endpoint for filtering articles based on metadata such as id, text search, etc. Query parameters are used for filtering (e.g. GET /articles?query=Rundetårn, or GET /articles?ids=1&ids=2&ids=5)","description":"Filter articles based on metadata such as id, text search, etc.","operationId":"get_articles","parameters":[{"name":"query","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"Text search in articles","title":"Query"},"description":"Text search in articles"},{"name":"ids","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"List of article IDs (comma-separated, JSON list, or repeated)","title":"Ids"},"description":"List of article IDs (comma-separated, JSON list, or repeated)"},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","maximum":100,"minimum":1,"description":"Maximum number of results","default":50,"title":"Limit"},"description":"Maximum number of results"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/SearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/vector-search/indexes":{"get":{"tags":["lex-db"],"summary":"List all vector indexes and their metadata","description":"Return a list of all vector indexes and their metadata.","operationId":"list_vector_indexes","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"items":{"additionalProperties":true,"type":"object"},"type":"array","title":"Response List Vector Indexes"}}}}}}},"/api/vector-search/indexes/{index_name}":{"get":{"tags":["lex-db"],"summary":"Get metadata for a specific vector index","description":"Return metadata for a specific vector index.","operationId":"get_vector_index","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"object","additionalProperties":true,"title":"Response Get Vector Index"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"HyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10}},"type":"object","required":["query_text"],"title":"HyDESearchRequest","description":"HyDE search request model."},"HybridHyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_hyde":{"type":"integer","title":"Top K Hyde","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridHyDESearchRequest","description":"Adaptive hybrid search request model."},"HybridHyDESearchResults":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"hyde_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Hyde Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","hyde_rank","fts_rank","source"],"title":"HybridHyDESearchResults"},"HybridSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_semantic":{"type":"integer","title":"Top K Semantic","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridSearchRequest","description":"Hybrid search request model."},"HybridSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/lex_db__hybrid_search__SearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"HybridSearchResults","description":"Results of a hybrid search."},"SearchResults":{"properties":{"entries":{"items":{"$ref":"#/components/schemas/lex_db__database__SearchResult"},"type":"array","title":"Entries"},"total":{"type":"integer","title":"Total"},"limit":{"type":"integer","title":"Limit"}},"type":"object","required":["entries","total","limit"],"title":"SearchResults","description":"Results of a search."},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"VectorSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":5}},"type":"object","required":["query_text"],"title":"VectorSearchRequest","description":"Vector search request model."},"VectorSearchResult":{"properties":{"id_in_index":{"type":"integer","title":"Id In Index"},"source_article_id":{"type":"string","title":"Source Article Id"},"chunk_seq":{"type":"integer","title":"Chunk Seq"},"chunk_text":{"type":"string","title":"Chunk Text"},"distance":{"type":"number","title":"Distance"}},"type":"object","required":["id_in_index","source_article_id","chunk_seq","chunk_text","distance"],"title":"VectorSearchResult","description":"Result of a vector search."},"VectorSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/VectorSearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"VectorSearchResults","description":"Result of a vector search."},"lex_db__database__SearchResult":{"properties":{"id":{"type":"integer","title":"Id"},"xhtml_md":{"type":"string","title":"Xhtml Md"},"rank":{"type":"number","title":"Rank"},"url":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Url"},"title":{"type":"string","title":"Title"}},"type":"object","required":["id","xhtml_md","rank","title"],"title":"SearchResult","description":"Single result from a search."},"lex_db__hybrid_search__SearchResult":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"semantic_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Semantic Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","semantic_rank","fts_rank","source"],"title":"SearchResult"}}}} -{"openapi":"3.1.0","info":{"title":"Lex DB API","description":"A wrapper around a SQLite database for encyclopedia articles with vector and full-text search","version":"0.1.0"},"paths":{"/":{"get":{"tags":["Health"],"summary":"Health Check","description":"Health check endpoint.\n\nReturns:\n dict: Health check information.","operationId":"health_check__get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":true,"type":"object","title":"Response Health Check Get"}}}}}}},"/api/tables":{"get":{"tags":["lex-db"],"summary":"Get a list of tables in the database","description":"Get a list of tables in the database.","operationId":"get_tables","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"items":{"type":"string"},"type":"array"},"type":"object","title":"Response Get Tables"}}}}}}},"/api/vector-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Search a vector index for similar content to the query text","description":"Search a vector index for similar content to the query text.","operationId":"vector_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VectorSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Hybrid search combining semantic and keyword search with RRF fusion","description":"Perform hybrid search using RRF fusion of semantic and keyword search.","operationId":"hybrid_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridSearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"HyDE search using LLM-generated hypothetical document","description":"Perform HyDE search: generate hypothetical document, embed it, and search.","operationId":"hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"type":"object","additionalProperties":true},"title":"Response Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/hybrid-hyde-search/indexes/{index_name}/query":{"post":{"tags":["lex-db"],"summary":"Adaptive hybrid search with HyDE + FTS and query-type based weighting","description":"Perform adaptive hybrid search using HyDE + FTS with adaptive RRF weighting.","operationId":"hybrid_hyde_search","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/HybridHyDESearchRequest"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/HybridHyDESearchResults"},"title":"Response Hybrid Hyde Search"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/articles":{"get":{"tags":["lex-db"],"summary":"An endpoint for filtering articles based on metadata such as id, text search, etc. Query parameters are used for filtering (e.g. GET /articles?query=Rundetårn, or GET /articles?ids=1&ids=2&ids=5)","description":"Filter articles based on metadata such as id, text search, etc.","operationId":"get_articles","parameters":[{"name":"query","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"Text search in articles","title":"Query"},"description":"Text search in articles"},{"name":"ids","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"description":"List of article IDs (comma-separated, JSON list, or repeated)","title":"Ids"},"description":"List of article IDs (comma-separated, JSON list, or repeated)"},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","maximum":100,"minimum":1,"description":"Maximum number of results","default":50,"title":"Limit"},"description":"Maximum number of results"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/SearchResults"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/vector-search/indexes":{"get":{"tags":["lex-db"],"summary":"List all vector indexes and their metadata","description":"Return a list of all vector indexes and their metadata.","operationId":"list_vector_indexes","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"items":{"additionalProperties":true,"type":"object"},"type":"array","title":"Response List Vector Indexes"}}}}}}},"/api/vector-search/indexes/{index_name}":{"get":{"tags":["lex-db"],"summary":"Get metadata for a specific vector index","description":"Return metadata for a specific vector index.","operationId":"get_vector_index","parameters":[{"name":"index_name","in":"path","required":true,"schema":{"type":"string","title":"Index Name"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"object","additionalProperties":true,"title":"Response Get Vector Index"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"HyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10}},"type":"object","required":["query_text"],"title":"HyDESearchRequest","description":"HyDE search request model."},"HybridHyDESearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_hyde":{"type":"integer","title":"Top K Hyde","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridHyDESearchRequest","description":"Adaptive hybrid search request model."},"HybridHyDESearchResults":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"hyde_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Hyde Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","hyde_rank","fts_rank","source"],"title":"HybridHyDESearchResults"},"HybridSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":10},"top_k_semantic":{"type":"integer","title":"Top K Semantic","default":50},"top_k_fts":{"type":"integer","title":"Top K Fts","default":50},"rrf_k":{"type":"integer","title":"Rrf K","default":60}},"type":"object","required":["query_text"],"title":"HybridSearchRequest","description":"Hybrid search request model."},"HybridSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/lex_db__hybrid_search__SearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"HybridSearchResults","description":"Results of a hybrid search."},"SearchResults":{"properties":{"entries":{"items":{"$ref":"#/components/schemas/lex_db__database__SearchResult"},"type":"array","title":"Entries"},"total":{"type":"integer","title":"Total"},"limit":{"type":"integer","title":"Limit"}},"type":"object","required":["entries","total","limit"],"title":"SearchResults","description":"Results of a search."},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"VectorSearchRequest":{"properties":{"query_text":{"type":"string","title":"Query Text"},"top_k":{"type":"integer","title":"Top K","default":5}},"type":"object","required":["query_text"],"title":"VectorSearchRequest","description":"Vector search request model."},"VectorSearchResult":{"properties":{"id_in_index":{"type":"integer","title":"Id In Index"},"source_article_id":{"type":"string","title":"Source Article Id"},"chunk_seq":{"type":"integer","title":"Chunk Seq"},"chunk_text":{"type":"string","title":"Chunk Text"},"distance":{"type":"number","title":"Distance"}},"type":"object","required":["id_in_index","source_article_id","chunk_seq","chunk_text","distance"],"title":"VectorSearchResult","description":"Result of a vector search."},"VectorSearchResults":{"properties":{"results":{"items":{"$ref":"#/components/schemas/VectorSearchResult"},"type":"array","title":"Results"}},"type":"object","required":["results"],"title":"VectorSearchResults","description":"Result of a vector search."},"lex_db__database__SearchResult":{"properties":{"id":{"type":"integer","title":"Id"},"xhtml_md":{"type":"string","title":"Xhtml Md"},"rank":{"type":"number","title":"Rank"},"url":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Url"},"title":{"type":"string","title":"Title"}},"type":"object","required":["id","xhtml_md","rank","title"],"title":"SearchResult","description":"Single result from a search."},"lex_db__hybrid_search__SearchResult":{"properties":{"rank":{"type":"integer","title":"Rank"},"article_id":{"type":"integer","title":"Article Id"},"article_headword":{"type":"string","title":"Article Headword"},"chunk_sequence":{"type":"integer","title":"Chunk Sequence"},"chunk_text":{"type":"string","title":"Chunk Text"},"rrf_score":{"type":"number","title":"Rrf Score"},"semantic_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Semantic Rank"},"fts_rank":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Fts Rank"},"source":{"type":"string","title":"Source"}},"type":"object","required":["rank","article_id","article_headword","chunk_sequence","chunk_text","rrf_score","semantic_rank","fts_rank","source"],"title":"SearchResult"}}}} + title: SearchResult \ No newline at end of file From 460ee6bb5a106c863569979e638e55501f3bf638 Mon Sep 17 00:00:00 2001 From: Zafar Date: Tue, 16 Dec 2025 16:23:24 +0100 Subject: [PATCH 09/12] Update lex-db.yaml --- openapi/lex-db.yaml | 532 +++++++++++++++++++++++--------------------- 1 file changed, 281 insertions(+), 251 deletions(-) diff --git a/openapi/lex-db.yaml b/openapi/lex-db.yaml index 279688b..fc1f70c 100644 --- a/openapi/lex-db.yaml +++ b/openapi/lex-db.yaml @@ -1,19 +1,16 @@ openapi: 3.1.0 info: title: Lex DB API - description: A wrapper around a SQLite database for encyclopedia articles with vector and full-text search + description: A wrapper around a SQLite database for encyclopedia articles with vector + and full-text search version: 0.1.0 paths: /: get: tags: - - Health + - Health summary: Health Check - description: | - Health check endpoint. - - Returns: - dict: Health check information. + description: "Health check endpoint.\n\nReturns:\n dict: Health check information." operationId: health_check__get responses: '200': @@ -27,7 +24,7 @@ paths: /api/tables: get: tags: - - lex-db + - lex-db summary: Get a list of tables in the database description: Get a list of tables in the database. operationId: get_tables @@ -46,17 +43,17 @@ paths: /api/vector-search/indexes/{index_name}/query: post: tags: - - lex-db + - lex-db summary: Search a vector index for similar content to the query text description: Search a vector index for similar content to the query text. operationId: vector_search parameters: - - name: index_name - in: path - required: true - schema: - type: string - title: Index Name + - name: index_name + in: path + required: true + schema: + type: string + title: Index Name requestBody: required: true content: @@ -79,17 +76,17 @@ paths: /api/hybrid-search/indexes/{index_name}/query: post: tags: - - lex-db - summary: Hybrid search combining semantic and keyword search with RRF fusion - description: Perform hybrid search using RRF fusion of semantic and keyword search. + - lex-db + summary: Perform hybrid search combining semantic and full-text search + description: Perform hybrid search using RRF fusion. operationId: hybrid_search parameters: - - name: index_name - in: path - required: true - schema: - type: string - title: Index Name + - name: index_name + in: path + required: true + schema: + type: string + title: Index Name requestBody: required: true content: @@ -112,17 +109,17 @@ paths: /api/hyde-search/indexes/{index_name}/query: post: tags: - - lex-db - summary: HyDE search using LLM-generated hypothetical document - description: Perform HyDE search; generate hypothetical document, embed it, and search. + - lex-db + summary: Perform HyDE (Hypothetical Document Embeddings) search + description: Perform HyDE search via the lex-db API. operationId: hyde_search parameters: - - name: index_name - in: path - required: true - schema: - type: string - title: Index Name + - name: index_name + in: path + required: true + schema: + type: string + title: Index Name requestBody: required: true content: @@ -139,7 +136,6 @@ paths: items: type: object additionalProperties: true - title: Response Hyde Search '422': description: Validation Error content: @@ -149,17 +145,17 @@ paths: /api/hybrid-hyde-search/indexes/{index_name}/query: post: tags: - - lex-db - summary: Adaptive hybrid search with HyDE + FTS and query-type based weighting - description: Perform adaptive hybrid search using HyDE + FTS with adaptive RRF weighting. + - lex-db + summary: Perform hybrid HyDE search with adaptive RRF weighting + description: Perform hybrid HyDE search combining HyDE and full-text search. operationId: hybrid_hyde_search parameters: - - name: index_name - in: path - required: true - schema: - type: string - title: Index Name + - name: index_name + in: path + required: true + schema: + type: string + title: Index Name requestBody: required: true content: @@ -174,8 +170,7 @@ paths: schema: type: array items: - $ref: '#/components/schemas/HybridHyDESearchResults' - title: Response Hybrid Hyde Search + $ref: '#/components/schemas/LexDbHybridSearchSearchResult' '422': description: Validation Error content: @@ -185,42 +180,44 @@ paths: /api/articles: get: tags: - - lex-db - summary: An endpoint for filtering articles based on metadata + - lex-db + summary: "An endpoint for filtering articles based on metadata such as id, text\ + \ search, etc. Query parameters are used for filtering (e.g. GET /articles?query=Rundet\xE5\ + rn, or GET /articles?ids=1&ids=2&ids=5)" description: Filter articles based on metadata such as id, text search, etc. operationId: get_articles parameters: - - name: query - in: query - required: false - schema: - anyOf: - - type: string - - type: 'null' - description: Text search in articles - title: Query + - name: query + in: query + required: false + schema: + anyOf: + - type: string + - type: 'null' description: Text search in articles - - name: ids - in: query - required: false - schema: - anyOf: - - type: string - - type: 'null' - description: List of article IDs (comma-separated, JSON list, or repeated) - title: Ids + title: Query + description: Text search in articles + - name: ids + in: query + required: false + schema: + anyOf: + - type: string + - type: 'null' description: List of article IDs (comma-separated, JSON list, or repeated) - - name: limit - in: query - required: false - schema: - type: integer - maximum: 100 - minimum: 1 - description: Maximum number of results - default: 50 - title: Limit + title: Ids + description: List of article IDs (comma-separated, JSON list, or repeated) + - name: limit + in: query + required: false + schema: + type: integer + maximum: 100 + minimum: 1 description: Maximum number of results + default: 50 + title: Limit + description: Maximum number of results responses: '200': description: Successful Response @@ -237,7 +234,7 @@ paths: /api/vector-search/indexes: get: tags: - - lex-db + - lex-db summary: List all vector indexes and their metadata description: Return a list of all vector indexes and their metadata. operationId: list_vector_indexes @@ -255,17 +252,17 @@ paths: /api/vector-search/indexes/{index_name}: get: tags: - - lex-db + - lex-db summary: Get metadata for a specific vector index description: Return metadata for a specific vector index. operationId: get_vector_index parameters: - - name: index_name - in: path - required: true - schema: - type: string - title: Index Name + - name: index_name + in: path + required: true + schema: + type: string + title: Index Name responses: '200': description: Successful Response @@ -281,8 +278,88 @@ paths: application/json: schema: $ref: '#/components/schemas/HTTPValidationError' + /api/benchmark/embeddings: + post: + tags: + - lex-db + summary: Benchmark embedding generation performance + description: Benchmark embedding generation with configurable parameters. + operationId: benchmark_embeddings + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/BenchmarkEmbeddingsRequest' + required: true + responses: + '200': + description: Successful Response + content: + application/json: + schema: + $ref: '#/components/schemas/BenchmarkEmbeddingsResponse' + '422': + description: Validation Error + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' components: schemas: + BenchmarkEmbeddingsRequest: + properties: + model_choice: + $ref: '#/components/schemas/EmbeddingModel' + default: intfloat/multilingual-e5-large + num_texts: + type: integer + title: Num Texts + default: 50 + text_length: + type: integer + title: Text Length + default: 200 + type: object + title: BenchmarkEmbeddingsRequest + BenchmarkEmbeddingsResponse: + properties: + num_texts: + type: integer + title: Num Texts + avg_text_length: + type: integer + title: Avg Text Length + total_time_seconds: + type: number + title: Total Time Seconds + texts_per_second: + type: number + title: Texts Per Second + ms_per_text: + type: number + title: Ms Per Text + embedding_dimension: + type: integer + title: Embedding Dimension + type: object + required: + - num_texts + - avg_text_length + - total_time_seconds + - texts_per_second + - ms_per_text + - embedding_dimension + title: BenchmarkEmbeddingsResponse + EmbeddingModel: + type: string + enum: + - intfloat/multilingual-e5-large + - text-embedding-ada-002 + - text-embedding-3-small + - text-embedding-3-large + - mock_model + title: EmbeddingModel + description: Supported embedding models. HTTPValidationError: properties: detail: @@ -292,134 +369,38 @@ components: title: Detail type: object title: HTTPValidationError - HyDESearchRequest: + SearchResult: properties: - query_text: - type: string - title: Query Text - top_k: + id: type: integer - title: Top K - default: 10 - type: object - required: - - query_text - title: HyDESearchRequest - description: HyDE search request model. - HybridHyDESearchRequest: - properties: - query_text: + title: Id + xhtml_md: type: string - title: Query Text - top_k: - type: integer - title: Top K - default: 10 - top_k_hyde: - type: integer - title: Top K Hyde - default: 50 - top_k_fts: - type: integer - title: Top K Fts - default: 50 - rrf_k: - type: integer - title: Rrf K - default: 60 - type: object - required: - - query_text - title: HybridHyDESearchRequest - description: Adaptive hybrid search request model. - HybridHyDESearchResults: - properties: + title: Xhtml Md rank: - type: integer - title: Rank - article_id: - type: integer - title: Article Id - article_headword: - type: string - title: Article Headword - chunk_sequence: - type: integer - title: Chunk Sequence - chunk_text: - type: string - title: Chunk Text - rrf_score: type: number - title: Rrf Score - hyde_rank: - anyOf: - - type: integer - - type: 'null' - title: Hyde Rank - fts_rank: + title: Rank + url: anyOf: - - type: integer - - type: 'null' - title: Fts Rank - source: - type: string - title: Source - type: object - required: - - rank - - article_id - - article_headword - - chunk_sequence - - chunk_text - - rrf_score - - hyde_rank - - fts_rank - - source - title: HybridHyDESearchResults - HybridSearchRequest: - properties: - query_text: + - type: string + - type: 'null' + title: Url + title: type: string - title: Query Text - top_k: - type: integer - title: Top K - default: 10 - top_k_semantic: - type: integer - title: Top K Semantic - default: 50 - top_k_fts: - type: integer - title: Top K Fts - default: 50 - rrf_k: - type: integer - title: Rrf K - default: 60 - type: object - required: - - query_text - title: HybridSearchRequest - description: Hybrid search request model. - HybridSearchResults: - properties: - results: - items: - $ref: '#/components/schemas/lex_db__hybrid_search__SearchResult' - type: array - title: Results + title: Title type: object required: - - results - title: HybridSearchResults - description: Results of a hybrid search. + - id + - xhtml_md + - rank + - title + title: SearchResult + description: Single result from a search. SearchResults: properties: entries: items: - $ref: '#/components/schemas/lex_db__database__SearchResult' + $ref: '#/components/schemas/SearchResult' type: array title: Entries total: @@ -430,9 +411,9 @@ components: title: Limit type: object required: - - entries - - total - - limit + - entries + - total + - limit title: SearchResults description: Results of a search. ValidationError: @@ -440,8 +421,8 @@ components: loc: items: anyOf: - - type: string - - type: integer + - type: string + - type: integer type: array title: Location msg: @@ -452,9 +433,9 @@ components: title: Error Type type: object required: - - loc - - msg - - type + - loc + - msg + - type title: ValidationError VectorSearchRequest: properties: @@ -467,7 +448,7 @@ components: default: 5 type: object required: - - query_text + - query_text title: VectorSearchRequest description: Vector search request model. VectorSearchResult: @@ -489,11 +470,11 @@ components: title: Distance type: object required: - - id_in_index - - source_article_id - - chunk_seq - - chunk_text - - distance + - id_in_index + - source_article_id + - chunk_seq + - chunk_text + - distance title: VectorSearchResult description: Result of a vector search. VectorSearchResults: @@ -505,37 +486,48 @@ components: title: Results type: object required: - - results + - results title: VectorSearchResults description: Result of a vector search. - lex_db__database__SearchResult: + HybridSearchRequest: properties: - id: - type: integer - title: Id - xhtml_md: - type: string - title: Xhtml Md - rank: - type: number - title: Rank - url: - anyOf: - - type: string - - type: 'null' - title: Url - title: + query_text: type: string - title: Title + title: Query Text + top_k: + type: integer + title: Top K + default: 10 + top_k_semantic: + type: integer + title: Top K Semantic + default: 50 + top_k_fts: + type: integer + title: Top K Fts + default: 50 + rrf_k: + type: integer + title: Rrf K + default: 60 type: object required: - - id - - xhtml_md - - rank - - title - title: SearchResult - description: Single result from a search. - lex_db__hybrid_search__SearchResult: + - query_text + title: HybridSearchRequest + description: Hybrid search request model. + HybridSearchResults: + properties: + results: + items: + $ref: '#/components/schemas/LexDbHybridSearchSearchResult' + type: array + title: Results + type: object + required: + - results + title: HybridSearchResults + description: Results of a hybrid search. + LexDbHybridSearchSearchResult: properties: rank: type: integer @@ -557,26 +549,64 @@ components: title: Rrf Score semantic_rank: anyOf: - - type: integer - - type: 'null' + - type: integer + - type: 'null' title: Semantic Rank fts_rank: anyOf: - - type: integer - - type: 'null' + - type: integer + - type: 'null' title: Fts Rank source: type: string title: Source type: object required: - - rank - - article_id - - article_headword - - chunk_sequence - - chunk_text - - rrf_score - - semantic_rank - - fts_rank - - source - title: SearchResult \ No newline at end of file + - rank + - article_id + - article_headword + - chunk_sequence + - chunk_text + - rrf_score + - source + title: LexDbHybridSearchSearchResult + HyDESearchRequest: + properties: + query_text: + type: string + title: Query Text + top_k: + type: integer + title: Top K + default: 10 + type: object + required: + - query_text + title: HyDESearchRequest + description: HyDE search request model. + HybridHyDESearchRequest: + properties: + query_text: + type: string + title: Query Text + top_k: + type: integer + title: Top K + default: 10 + top_k_hyde: + type: integer + title: Top K Hyde + default: 50 + top_k_fts: + type: integer + title: Top K Fts + default: 50 + rrf_k: + type: integer + title: Rrf K + default: 60 + type: object + required: + - query_text + title: HybridHyDESearchRequest + description: Adaptive hybrid search request model. From 6b56858d45f8c59ddf3b4ff447392b4b82a0c988 Mon Sep 17 00:00:00 2001 From: Zafar Date: Tue, 16 Dec 2025 17:21:40 +0100 Subject: [PATCH 10/12] Update lex-db.yaml --- openapi/lex-db.yaml | 111 -------------------------------------------- 1 file changed, 111 deletions(-) diff --git a/openapi/lex-db.yaml b/openapi/lex-db.yaml index fc1f70c..639fd78 100644 --- a/openapi/lex-db.yaml +++ b/openapi/lex-db.yaml @@ -106,77 +106,6 @@ paths: application/json: schema: $ref: '#/components/schemas/HTTPValidationError' - /api/hyde-search/indexes/{index_name}/query: - post: - tags: - - lex-db - summary: Perform HyDE (Hypothetical Document Embeddings) search - description: Perform HyDE search via the lex-db API. - operationId: hyde_search - parameters: - - name: index_name - in: path - required: true - schema: - type: string - title: Index Name - requestBody: - required: true - content: - application/json: - schema: - $ref: '#/components/schemas/HyDESearchRequest' - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: array - items: - type: object - additionalProperties: true - '422': - description: Validation Error - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - /api/hybrid-hyde-search/indexes/{index_name}/query: - post: - tags: - - lex-db - summary: Perform hybrid HyDE search with adaptive RRF weighting - description: Perform hybrid HyDE search combining HyDE and full-text search. - operationId: hybrid_hyde_search - parameters: - - name: index_name - in: path - required: true - schema: - type: string - title: Index Name - requestBody: - required: true - content: - application/json: - schema: - $ref: '#/components/schemas/HybridHyDESearchRequest' - responses: - '200': - description: Successful Response - content: - application/json: - schema: - type: array - items: - $ref: '#/components/schemas/LexDbHybridSearchSearchResult' - '422': - description: Validation Error - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' /api/articles: get: tags: @@ -570,43 +499,3 @@ components: - rrf_score - source title: LexDbHybridSearchSearchResult - HyDESearchRequest: - properties: - query_text: - type: string - title: Query Text - top_k: - type: integer - title: Top K - default: 10 - type: object - required: - - query_text - title: HyDESearchRequest - description: HyDE search request model. - HybridHyDESearchRequest: - properties: - query_text: - type: string - title: Query Text - top_k: - type: integer - title: Top K - default: 10 - top_k_hyde: - type: integer - title: Top K Hyde - default: 50 - top_k_fts: - type: integer - title: Top K Fts - default: 50 - rrf_k: - type: integer - title: Rrf K - default: 60 - type: object - required: - - query_text - title: HybridHyDESearchRequest - description: Adaptive hybrid search request model. From 4f8c573c937aeef53d02a68f9f9495e7f11db25d Mon Sep 17 00:00:00 2001 From: Zafar Date: Tue, 16 Dec 2025 17:28:20 +0100 Subject: [PATCH 11/12] Update lex_db_connector.py --- .../api/connectors/lex_db_connector.py | 72 ------------------- 1 file changed, 72 deletions(-) diff --git a/src/lex_llm/api/connectors/lex_db_connector.py b/src/lex_llm/api/connectors/lex_db_connector.py index 4e2d45b..fe24cd8 100644 --- a/src/lex_llm/api/connectors/lex_db_connector.py +++ b/src/lex_llm/api/connectors/lex_db_connector.py @@ -8,9 +8,6 @@ from lex_db_api.configuration import Configuration from lex_db_api.models.vector_search_request import VectorSearchRequest from lex_db_api.models.hybrid_search_request import HybridSearchRequest -from lex_db_api.models.hy_de_search_request import HyDESearchRequest -from lex_db_api.models.hybrid_hy_de_search_request import HybridHyDESearchRequest - lexdb_client = ApiClient( configuration=Configuration(host=os.getenv("DB_HOST", "http://localhost:8000")) @@ -101,72 +98,3 @@ async def hybrid_search( except httpx.RequestError as e: print(f"Error connecting to LexDB: {e}") return [] - - async def hyde_search( - self, - query: str, - top_k: int = 10, - index_name: str = "article_embeddings_e5", - ) -> List[LexArticle]: - """Performs HyDE search via the lex-db API.""" - - try: - hyde_req = HyDESearchRequest(query_text=query, top_k=top_k) - - hyde_search_result = lexdb_api.hyde_search(index_name, hyde_req) - - if hyde_search_result: - return [ - LexArticle( - id=result["article_id"], - title=result["headword"], - text=result["text"], - url=f"https://lex.dk/{result['headword']}", - ) - for result in hyde_search_result - ] - - return [] - except httpx.RequestError as e: - print(f"Error connecting to LexDB: {e}") - return [] - - async def hybrid_hyde_search( - self, - query: str, - top_k: int = 10, - top_k_hyde: int = 50, - top_k_fts: int = 50, - rrf_k: int = 60, - index_name: str = "article_embeddings_e5", - ) -> List[LexArticle]: - """Performs hybrid HyDE search with adaptive RRF weighting.""" - - try: - hybrid_hyde_req = HybridHyDESearchRequest( - query_text=query, - top_k=top_k, - top_k_hyde=top_k_hyde, - top_k_fts=top_k_fts, - rrf_k=rrf_k, - ) - - hybrid_hyde_result = lexdb_api.hybrid_hyde_search( - index_name, hybrid_hyde_req - ) - - if hybrid_hyde_result: - return [ - LexArticle( - id=result.article_id, - title=result.article_headword, - text=result.chunk_text, - url=f"https://lex.dk/{result.article_headword}", - ) - for result in hybrid_hyde_result - ] - - return [] - except httpx.RequestError as e: - print(f"Error connecting to LexDB: {e}") - return [] From 59c8af1d984d324f1075abb6fbf16157ee76dcbf Mon Sep 17 00:00:00 2001 From: Zafar Date: Tue, 16 Dec 2025 17:29:29 +0100 Subject: [PATCH 12/12] Simplify search method options in search_knowledge_base Removed unused search methods from the search function. --- src/lex_llm/tools/search_knowledge_base.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/lex_llm/tools/search_knowledge_base.py b/src/lex_llm/tools/search_knowledge_base.py index 15f8b17..2399844 100644 --- a/src/lex_llm/tools/search_knowledge_base.py +++ b/src/lex_llm/tools/search_knowledge_base.py @@ -16,7 +16,7 @@ def search_knowledge_base( Args: index_name: The name of the vector index to search top_k: Number of top results to retrieve - search_method: Search method to use - one of: "vector_search", "hybrid_search", "hyde_search", "hybrid_hyde_search" + search_method: Search method to use - one of: "vector_search", "hybrid_search" Returns: An async generator function compatible with the Orchestrator """ @@ -32,14 +32,6 @@ async def search_knowledge_base( documents = await lex_db_connector.hybrid_search( query=user_input, top_k=top_k, index_name=index_name ) - elif search_method == "hyde_search": - documents = await lex_db_connector.hyde_search( - query=user_input, top_k=top_k, index_name=index_name - ) - elif search_method == "hybrid_hyde_search": - documents = await lex_db_connector.hybrid_hyde_search( - query=user_input, top_k=top_k, index_name=index_name - ) else: # Default to vector_search documents = await lex_db_connector.vector_search( query=user_input, top_k=top_k, index_name=index_name