Skip to content

Commit 65e9a96

Browse files
committed
Merge branch 'main' into reasoning-models
2 parents b3456fc + 74ad1d4 commit 65e9a96

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

46 files changed

+7418
-44
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "patch",
3+
"description": "Added batching logic to the prompt tuning autoselection embeddings workflow"
4+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "patch",
3+
"description": "add vector store integration tests"
4+
}

dictionary.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,3 +200,8 @@ unnavigated
200200
# Names
201201
Hochul
202202
Ashish
203+
204+
#unified-search
205+
apos
206+
dearmor
207+
venv

graphrag/api/prompt_tune.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ async def generate_indexing_prompts(
111111

112112
# if max_retries is not set, inject a dynamically assigned value based on the number of expected LLM calls
113113
# to be made or fallback to a default value in the worst case
114-
if default_llm_settings.max_retries == -1:
114+
if default_llm_settings.max_retries < -1:
115115
default_llm_settings.max_retries = min(
116116
len(doc_list), language_model_defaults.max_retries
117117
)

graphrag/prompt_tune/loader/input.py

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@
66
import numpy as np
77
import pandas as pd
88

9+
from graphrag.cache.noop_pipeline_cache import NoopPipelineCache
910
from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks
1011
from graphrag.config.models.graph_rag_config import GraphRagConfig
1112
from graphrag.index.input.factory import create_input
13+
from graphrag.index.operations.embed_text.strategies.openai import (
14+
run as run_embed_text,
15+
)
1216
from graphrag.index.workflows.create_base_text_units import create_base_text_units
13-
from graphrag.language_model.manager import ModelManager
14-
from graphrag.language_model.protocol.base import EmbeddingModel
1517
from graphrag.logger.base import ProgressLogger
1618
from graphrag.prompt_tune.defaults import (
1719
LIMIT,
@@ -21,20 +23,9 @@
2123
from graphrag.prompt_tune.types import DocSelectionType
2224

2325

24-
async def _embed_chunks(
25-
text_chunks: pd.DataFrame,
26-
embedding_llm: EmbeddingModel,
27-
n_subset_max: int = N_SUBSET_MAX,
28-
) -> tuple[pd.DataFrame, np.ndarray]:
29-
"""Convert text chunks into dense text embeddings."""
30-
sampled_text_chunks = text_chunks.sample(n=min(n_subset_max, len(text_chunks)))
31-
embeddings = await embedding_llm.aembed_batch(sampled_text_chunks["text"].tolist())
32-
return text_chunks, np.array(embeddings)
33-
34-
3526
def _sample_chunks_from_embeddings(
3627
text_chunks: pd.DataFrame,
37-
embeddings,
28+
embeddings: np.ndarray[float, np.dtype[np.float_]],
3829
k: int = K,
3930
) -> pd.DataFrame:
4031
"""Sample text chunks from embeddings."""
@@ -60,7 +51,6 @@ async def load_docs_in_chunks(
6051
embeddings_llm_settings = config.get_language_model_config(
6152
config.embed_text.model_id
6253
)
63-
6454
dataset = await create_input(config.input, logger, root)
6555
chunk_config = config.chunks
6656
chunks_df = create_base_text_units(
@@ -88,18 +78,29 @@ async def load_docs_in_chunks(
8878
if k is None or k <= 0:
8979
msg = "k must be an integer > 0"
9080
raise ValueError(msg)
91-
embedding_llm = ModelManager().register_embedding(
92-
name="prompt_tuning_embeddings",
93-
model_type=embeddings_llm_settings.type,
94-
config=embeddings_llm_settings,
95-
callbacks=NoopWorkflowCallbacks(),
96-
cache=None,
97-
)
9881

99-
chunks_df, embeddings = await _embed_chunks(
100-
chunks_df, embedding_llm, n_subset_max=n_subset_max
82+
"""Convert text chunks into dense text embeddings."""
83+
sampled_text_chunks = chunks_df.sample(n=min(n_subset_max, len(chunks_df)))[
84+
"text"
85+
].tolist()
86+
87+
embedding_results = await run_embed_text(
88+
sampled_text_chunks,
89+
callbacks=NoopWorkflowCallbacks(),
90+
cache=NoopPipelineCache(),
91+
args={
92+
"llm": embeddings_llm_settings.model_dump(),
93+
"num_threads": embeddings_llm_settings.concurrent_requests,
94+
"batch_size": config.embed_text.batch_size,
95+
"batch_max_tokens": config.embed_text.batch_max_tokens,
96+
},
10197
)
98+
embeddings = np.array(embedding_results.embeddings)
10299
chunks_df = _sample_chunks_from_embeddings(chunks_df, embeddings, k=k)
103100

104101
# Convert the dataset to list form, so we have a list of documents
105-
return chunks_df["text"].tolist()
102+
return [
103+
# need this to prevent the str.format() function from breaking when parsing LaTeX from markdown files
104+
i.replace("{", "{{").replace("}", "}}")
105+
for i in chunks_df["text"]
106+
]

graphrag/vector_stores/cosmosdb.py

Lines changed: 71 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from typing import Any
88

99
from azure.cosmos import ContainerProxy, CosmosClient, DatabaseProxy
10+
from azure.cosmos.exceptions import CosmosHttpResponseError
1011
from azure.cosmos.partition_key import PartitionKey
1112
from azure.identity import DefaultAzureCredential
1213

@@ -19,7 +20,7 @@
1920
)
2021

2122

22-
class CosmosDBVectoreStore(BaseVectorStore):
23+
class CosmosDBVectorStore(BaseVectorStore):
2324
"""Azure CosmosDB vector storage implementation."""
2425

2526
_cosmos_client: CosmosClient
@@ -99,16 +100,32 @@ def _create_container(self) -> None:
99100
"automatic": True,
100101
"includedPaths": [{"path": "/*"}],
101102
"excludedPaths": [{"path": "/_etag/?"}, {"path": "/vector/*"}],
102-
"vectorIndexes": [{"path": "/vector", "type": "diskANN"}],
103103
}
104104

105-
# Create the container and container client
106-
self._database_client.create_container_if_not_exists(
107-
id=self._container_name,
108-
partition_key=partition_key,
109-
indexing_policy=indexing_policy,
110-
vector_embedding_policy=vector_embedding_policy,
111-
)
105+
# Currently, the CosmosDB emulator does not support the diskANN policy.
106+
try:
107+
# First try with the standard diskANN policy
108+
indexing_policy["vectorIndexes"] = [{"path": "/vector", "type": "diskANN"}]
109+
110+
# Create the container and container client
111+
self._database_client.create_container_if_not_exists(
112+
id=self._container_name,
113+
partition_key=partition_key,
114+
indexing_policy=indexing_policy,
115+
vector_embedding_policy=vector_embedding_policy,
116+
)
117+
except CosmosHttpResponseError:
118+
# If diskANN fails (likely in emulator), retry without vector indexes
119+
indexing_policy.pop("vectorIndexes", None)
120+
121+
# Create the container with compatible indexing policy
122+
self._database_client.create_container_if_not_exists(
123+
id=self._container_name,
124+
partition_key=partition_key,
125+
indexing_policy=indexing_policy,
126+
vector_embedding_policy=vector_embedding_policy,
127+
)
128+
112129
self._container_client = self._database_client.get_container_client(
113130
self._container_name
114131
)
@@ -157,13 +174,46 @@ def similarity_search_by_vector(
157174
msg = "Container client is not initialized."
158175
raise ValueError(msg)
159176

160-
query = f"SELECT TOP {k} c.id, c.text, c.vector, c.attributes, VectorDistance(c.vector, @embedding) AS SimilarityScore FROM c ORDER BY VectorDistance(c.vector, @embedding)" # noqa: S608
161-
query_params = [{"name": "@embedding", "value": query_embedding}]
162-
items = self._container_client.query_items(
163-
query=query,
164-
parameters=query_params,
165-
enable_cross_partition_query=True,
166-
)
177+
try:
178+
query = f"SELECT TOP {k} c.id, c.text, c.vector, c.attributes, VectorDistance(c.vector, @embedding) AS SimilarityScore FROM c ORDER BY VectorDistance(c.vector, @embedding)" # noqa: S608
179+
query_params = [{"name": "@embedding", "value": query_embedding}]
180+
items = list(
181+
self._container_client.query_items(
182+
query=query,
183+
parameters=query_params,
184+
enable_cross_partition_query=True,
185+
)
186+
)
187+
except (CosmosHttpResponseError, ValueError):
188+
# Currently, the CosmosDB emulator does not support the VectorDistance function.
189+
# For emulator or test environments - fetch all items and calculate distance locally
190+
query = "SELECT c.id, c.text, c.vector, c.attributes FROM c"
191+
items = list(
192+
self._container_client.query_items(
193+
query=query,
194+
enable_cross_partition_query=True,
195+
)
196+
)
197+
198+
# Calculate cosine similarity locally (1 - cosine distance)
199+
from numpy import dot
200+
from numpy.linalg import norm
201+
202+
def cosine_similarity(a, b):
203+
if norm(a) * norm(b) == 0:
204+
return 0.0
205+
return dot(a, b) / (norm(a) * norm(b))
206+
207+
# Calculate scores for all items
208+
for item in items:
209+
item_vector = item.get("vector", [])
210+
similarity = cosine_similarity(query_embedding, item_vector)
211+
item["SimilarityScore"] = similarity
212+
213+
# Sort by similarity score (higher is better) and take top k
214+
items = sorted(
215+
items, key=lambda x: x.get("SimilarityScore", 0.0), reverse=True
216+
)[:k]
167217

168218
return [
169219
VectorStoreSearchResult(
@@ -214,3 +264,8 @@ def search_by_id(self, id: str) -> VectorStoreDocument:
214264
text=item.get("text", ""),
215265
attributes=(json.loads(item.get("attributes", "{}"))),
216266
)
267+
268+
def clear(self) -> None:
269+
"""Clear the vector store."""
270+
self._delete_container()
271+
self._delete_database()

graphrag/vector_stores/factory.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from graphrag.vector_stores.azure_ai_search import AzureAISearchVectorStore
1010
from graphrag.vector_stores.base import BaseVectorStore
11-
from graphrag.vector_stores.cosmosdb import CosmosDBVectoreStore
11+
from graphrag.vector_stores.cosmosdb import CosmosDBVectorStore
1212
from graphrag.vector_stores.lancedb import LanceDBVectorStore
1313

1414

@@ -44,7 +44,7 @@ def create_vector_store(
4444
case VectorStoreType.AzureAISearch:
4545
return AzureAISearchVectorStore(**kwargs)
4646
case VectorStoreType.CosmosDB:
47-
return CosmosDBVectoreStore(**kwargs)
47+
return CosmosDBVectorStore(**kwargs)
4848
case _:
4949
if vector_store_type in cls.vector_store_types:
5050
return cls.vector_store_types[vector_store_type](**kwargs)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Copyright (c) 2024 Microsoft Corporation.
2+
# Licensed under the MIT License
3+
4+
"""Integration tests for vector store implementations."""

0 commit comments

Comments
 (0)