move vector_store from embeddings to top level of config and delete resolve_paths

dworthen · dworthen · commit 7a974bf8eb77 · 2025-01-17T07:26:33.000-08:00
diff --git a/graphrag/api/query.py b/graphrag/api/query.py
@@ -244,7 +244,7 @@ async def local_search(
     ------
     TODO: Document any exceptions to expect.
     """
-    vector_store_args = config.embeddings.vector_store
+    vector_store_args = config.vector_store
     logger.info(f"Vector Store Args: {redact(vector_store_args)}")  # type: ignore # noqa
 
     description_embedding_store = _get_embedding_store(
@@ -310,7 +310,7 @@ async def local_search_streaming(
     ------
     TODO: Document any exceptions to expect.
     """
-    vector_store_args = config.embeddings.vector_store
+    vector_store_args = config.vector_store
     logger.info(f"Vector Store Args: {redact(vector_store_args)}")  # type: ignore # noqa
 
     description_embedding_store = _get_embedding_store(
@@ -381,7 +381,7 @@ async def drift_search_streaming(
     ------
     TODO: Document any exceptions to expect.
     """
-    vector_store_args = config.embeddings.vector_store
+    vector_store_args = config.vector_store
     logger.info(f"Vector Store Args: {redact(vector_store_args)}")  # type: ignore # noqa
 
     description_embedding_store = _get_embedding_store(
@@ -465,7 +465,7 @@ async def drift_search(
     ------
     TODO: Document any exceptions to expect.
     """
-    vector_store_args = config.embeddings.vector_store
+    vector_store_args = config.vector_store
     logger.info(f"Vector Store Args: {redact(vector_store_args)}")  # type: ignore # noqa
 
     description_embedding_store = _get_embedding_store(
@@ -531,7 +531,7 @@ async def basic_search(
     ------
     TODO: Document any exceptions to expect.
     """
-    vector_store_args = config.embeddings.vector_store
+    vector_store_args = config.vector_store
     logger.info(f"Vector Store Args: {redact(vector_store_args)}")  # type: ignore # noqa
 
     description_embedding_store = _get_embedding_store(
@@ -576,7 +576,7 @@ async def basic_search_streaming(
     ------
     TODO: Document any exceptions to expect.
     """
-    vector_store_args = config.embeddings.vector_store
+    vector_store_args = config.vector_store
     logger.info(f"Vector Store Args: {redact(vector_store_args)}")  # type: ignore # noqa
 
     description_embedding_store = _get_embedding_store(
diff --git a/graphrag/cli/index.py b/graphrag/cli/index.py
@@ -14,7 +14,6 @@
 from graphrag.config.enums import CacheType
 from graphrag.config.load_config import load_config
 from graphrag.config.logging import enable_logging_with_config
-from graphrag.config.resolve_path import resolve_paths
 from graphrag.index.validate_config import validate_config_names
 from graphrag.logger.base import ProgressLogger
 from graphrag.logger.factory import LoggerFactory, LoggerType
@@ -146,7 +145,6 @@ def _run_index(
     config.reporting.base_dir = (
         str(output_dir) if output_dir else config.reporting.base_dir
     )
-    resolve_paths(config, run_id)
 
     if not cache:
         config.cache.type = CacheType.none
diff --git a/graphrag/cli/query.py b/graphrag/cli/query.py
@@ -12,7 +12,6 @@
 import graphrag.api as api
 from graphrag.config.load_config import load_config
 from graphrag.config.models.graph_rag_config import GraphRagConfig
-from graphrag.config.resolve_path import resolve_paths
 from graphrag.logger.print_progress import PrintProgressLogger
 from graphrag.storage.factory import StorageFactory
 from graphrag.utils.storage import load_table_from_storage, storage_has_table
@@ -37,7 +36,6 @@ def run_global_search(
     root = root_dir.resolve()
     config = load_config(root, config_filepath)
     config.storage.base_dir = str(data_dir) if data_dir else config.storage.base_dir
-    resolve_paths(config)
 
     dataframe_dict = _resolve_output_files(
         config=config,
@@ -121,7 +119,6 @@ def run_local_search(
     root = root_dir.resolve()
     config = load_config(root, config_filepath)
     config.storage.base_dir = str(data_dir) if data_dir else config.storage.base_dir
-    resolve_paths(config)
 
     dataframe_dict = _resolve_output_files(
         config=config,
@@ -212,7 +209,6 @@ def run_drift_search(
     root = root_dir.resolve()
     config = load_config(root, config_filepath)
     config.storage.base_dir = str(data_dir) if data_dir else config.storage.base_dir
-    resolve_paths(config)
 
     dataframe_dict = _resolve_output_files(
         config=config,
@@ -297,7 +293,6 @@ def run_basic_search(
     root = root_dir.resolve()
     config = load_config(root, config_filepath)
     config.storage.base_dir = str(data_dir) if data_dir else config.storage.base_dir
-    resolve_paths(config)
 
     dataframe_dict = _resolve_output_files(
         config=config,
diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py
@@ -101,19 +101,11 @@
 UMAP_ENABLED = False
 UPDATE_STORAGE_BASE_DIR = "update_output"
 
-VECTOR_STORE = f"""
-    type: {VectorStoreType.LanceDB.value} # one of [lancedb, azure_ai_search, cosmosdb]
-    db_uri: '{(Path(STORAGE_BASE_DIR) / "lancedb")!s}'
-    collection_name: default
-    overwrite: true\
-"""
-
-VECTOR_STORE_DICT = {
-    "type": VectorStoreType.LanceDB.value,
-    "db_uri": str(Path(STORAGE_BASE_DIR) / "lancedb"),
-    "collection_name": "default",
-    "overwrite": True,
-}
+
+VECTOR_STORE_TYPE = VectorStoreType.LanceDB
+VECTOR_STORE_DB_URI = str(Path(STORAGE_BASE_DIR) / "lancedb")
+VECTOR_STORE_COLLECTION_NAME = "default"
+VECTOR_STORE_OVERWRITE = True
 
 # Local Search
 LOCAL_SEARCH_TEXT_UNIT_PROP = 0.5
diff --git a/graphrag/config/embeddings.py b/graphrag/config/embeddings.py
@@ -57,7 +57,7 @@ def get_embedding_settings(
     embeddings_llm_settings = settings.get_language_model_config(
         settings.embeddings.model_id
     )
-    vector_store_settings = settings.embeddings.vector_store
+    vector_store_settings = settings.vector_store
     if vector_store_settings is None:
         return {
             "strategy": settings.embeddings.resolved_strategy(embeddings_llm_settings)
@@ -71,7 +71,10 @@ def get_embedding_settings(
         embeddings_llm_settings
     )  # get the default strategy
     strategy.update({
-        "vector_store": {**(vector_store_params or {}), **vector_store_settings}
+        "vector_store": {
+            **(vector_store_params or {}),
+            **(vector_store_settings.model_dump()),
+        }
     })  # update the default strategy with the vector store settings
     # This ensures the vector store config is part of the strategy and not the global config
     return {
diff --git a/graphrag/config/init_content.py b/graphrag/config/init_content.py
@@ -39,9 +39,13 @@
     # organization: <organization_id>
     # deployment_name: <azure_model_deployment_name>
 
+vector_store:
+    type: {defs.VECTOR_STORE_TYPE.value}
+    db_uri: {defs.VECTOR_STORE_DB_URI}
+    collection_name: {defs.VECTOR_STORE_COLLECTION_NAME}
+    overwrite: {defs.VECTOR_STORE_OVERWRITE}
+
 embeddings:
-  async_mode: {defs.ASYNC_MODE.value} # or asyncio
-  vector_store: {defs.VECTOR_STORE}
   model_id: {defs.DEFAULT_EMBEDDING_MODEL_ID}
 
 ### Input settings ###
@@ -83,28 +87,28 @@
 ### Workflow settings ###
 
 entity_extraction:
+  model_id: {defs.ENTITY_EXTRACTION_MODEL_ID}
   prompt: "prompts/entity_extraction.txt"
   entity_types: [{",".join(defs.ENTITY_EXTRACTION_ENTITY_TYPES)}]
   max_gleanings: {defs.ENTITY_EXTRACTION_MAX_GLEANINGS}
-  model_id: {defs.ENTITY_EXTRACTION_MODEL_ID}
 
 summarize_descriptions:
+  model_id: {defs.SUMMARIZE_MODEL_ID}
   prompt: "prompts/summarize_descriptions.txt"
   max_length: {defs.SUMMARIZE_DESCRIPTIONS_MAX_LENGTH}
-  model_id: {defs.SUMMARIZE_MODEL_ID}
 
 claim_extraction:
   enabled: false
+  model_id: {defs.CLAIM_EXTRACTION_MODEL_ID}
   prompt: "prompts/claim_extraction.txt"
   description: "{defs.CLAIM_DESCRIPTION}"
   max_gleanings: {defs.CLAIM_MAX_GLEANINGS}
-  model_id: {defs.CLAIM_EXTRACTION_MODEL_ID}
 
 community_reports:
+  model_id: {defs.COMMUNITY_REPORT_MODEL_ID}
   prompt: "prompts/community_report.txt"
   max_length: {defs.COMMUNITY_REPORT_MAX_LENGTH}
   max_input_length: {defs.COMMUNITY_REPORT_MAX_INPUT_LENGTH}
-  model_id: {defs.COMMUNITY_REPORT_MODEL_ID}
 
 cluster_graph:
   max_cluster_size: {defs.MAX_CLUSTER_SIZE}
diff --git a/graphrag/config/models/graph_rag_config.py b/graphrag/config/models/graph_rag_config.py
@@ -31,6 +31,8 @@
 )
 from graphrag.config.models.text_embedding_config import TextEmbeddingConfig
 from graphrag.config.models.umap_config import UmapConfig
+from graphrag.config.models.vector_store_config import VectorStoreConfig
+from graphrag.vector_stores.factory import VectorStoreType
 
 
 class GraphRagConfig(BaseModel):
@@ -51,11 +53,13 @@ def __str__(self):
     def _validate_root_dir(self) -> None:
         """Validate the root directory."""
         if self.root_dir.strip() == "":
-            self.root_dir = str(Path.cwd().resolve())
+            self.root_dir = str(Path.cwd())
 
-        if not Path(self.root_dir).is_dir():
+        root_dir = Path(self.root_dir).resolve()
+        if not root_dir.is_dir():
             msg = f"Invalid root directory: {self.root_dir} is not a directory."
             raise FileNotFoundError(msg)
+        self.root_dir = str(root_dir)
 
     models: dict[str, LanguageModelConfig] = Field(
         description="Available language model configurations.",
@@ -85,17 +89,50 @@ def _validate_models(self) -> None:
     )
     """The reporting configuration."""
 
+    def _validate_reporting_base_dir(self) -> None:
+        """Resolve the file-reporting base directory against the root directory.
+
+        Does nothing unless the reporting type is ``file``. Otherwise the
+        configured ``base_dir`` is joined onto ``root_dir``, resolved, and
+        written back to ``self.reporting.base_dir`` as a string.
+
+        Raises
+        ------
+        ValueError
+            If file reporting is selected and ``base_dir`` is blank.
+        """
+        reporting = self.reporting
+        if reporting.type != defs.ReportingType.file:
+            return
+
+        if not reporting.base_dir.strip():
+            msg = "Reporting base directory is required for file reporting. Please rerun `graphrag init` and set the reporting configuration."
+            raise ValueError(msg)
+
+        resolved_dir = (Path(self.root_dir) / reporting.base_dir).resolve()
+        reporting.base_dir = str(resolved_dir)
+
     storage: StorageConfig = Field(
         description="The storage configuration.", default=StorageConfig()
     )
     """The storage configuration."""
 
+    def _validate_storage_base_dir(self) -> None:
+        """Resolve the file-storage base directory against the root directory.
+
+        Does nothing unless the storage type is ``file``. Otherwise the
+        configured ``base_dir`` is joined onto ``root_dir``, resolved, and
+        written back to ``self.storage.base_dir`` as a string.
+
+        Raises
+        ------
+        ValueError
+            If file storage is selected and ``base_dir`` is blank.
+        """
+        storage = self.storage
+        if storage.type != defs.StorageType.file:
+            return
+
+        if not storage.base_dir.strip():
+            msg = "Storage base directory is required for file storage. Please rerun `graphrag init` and set the storage configuration."
+            raise ValueError(msg)
+
+        resolved_dir = (Path(self.root_dir) / storage.base_dir).resolve()
+        storage.base_dir = str(resolved_dir)
+
     update_index_storage: StorageConfig | None = Field(
         description="The storage configuration for the updated index.",
         default=None,
     )
     """The storage configuration for the updated index."""
 
+    def _validate_update_index_storage_base_dir(self) -> None:
+        """Resolve the update-index storage base directory against the root directory.
+
+        Does nothing when no update-index storage is configured or when its
+        type is not ``file``. Otherwise the configured ``base_dir`` is joined
+        onto ``root_dir``, resolved, and written back as a string.
+
+        Raises
+        ------
+        ValueError
+            If file storage is selected and ``base_dir`` is blank.
+        """
+        update_storage = self.update_index_storage
+        if not update_storage or update_storage.type != defs.StorageType.file:
+            return
+
+        if not update_storage.base_dir.strip():
+            msg = "Update index storage base directory is required for file storage. Please rerun `graphrag init` and set the update index storage configuration."
+            raise ValueError(msg)
+
+        resolved_dir = (Path(self.root_dir) / update_storage.base_dir).resolve()
+        update_storage.base_dir = str(resolved_dir)
+
     cache: CacheConfig = Field(
         description="The cache configuration.", default=CacheConfig()
     )
@@ -187,6 +224,21 @@ def _validate_models(self) -> None:
     )
     """The basic search configuration."""
 
+    vector_store: VectorStoreConfig = Field(
+        description="The vector store configuration.", default=VectorStoreConfig()
+    )
+    """The vector store configuration."""
+
+    def _validate_vector_store_db_uri(self) -> None:
+        """Validate and resolve the vector store database URI.
+
+        Only applies when the vector store type is LanceDB. The configured
+        ``db_uri`` is joined onto ``root_dir``, resolved, and written back to
+        ``self.vector_store.db_uri`` as a string.
+
+        Raises
+        ------
+        ValueError
+            If the vector store type is LanceDB and ``db_uri`` is blank.
+        """
+        if self.vector_store.type == VectorStoreType.LanceDB.value:
+            # `strip` must be called: comparing the bound method itself to ""
+            # is always False, which would let a blank URI through unchecked.
+            if self.vector_store.db_uri.strip() == "":
+                msg = "Vector store URI is required for LanceDB. Please rerun `graphrag init` and set the vector store configuration."
+                raise ValueError(msg)
+            self.vector_store.db_uri = str(
+                (Path(self.root_dir) / self.vector_store.db_uri).resolve()
+            )
+
     def get_language_model_config(self, model_id: str) -> LanguageModelConfig:
         """Get a model configuration by ID.
 
@@ -216,4 +268,8 @@ def _validate_model(self):
         """Validate the model configuration."""
         self._validate_root_dir()
         self._validate_models()
+        self._validate_reporting_base_dir()
+        self._validate_storage_base_dir()
+        self._validate_update_index_storage_base_dir()
+        self._validate_vector_store_db_uri()
         return self
diff --git a/graphrag/config/models/text_embedding_config.py b/graphrag/config/models/text_embedding_config.py
@@ -27,9 +27,6 @@ class TextEmbeddingConfig(BaseModel):
     names: list[str] = Field(
         description="The specific embeddings to perform.", default=[]
     )
-    vector_store: dict = Field(
-        description="The vector storage configuration", default=defs.VECTOR_STORE_DICT
-    )
     strategy: dict | None = Field(
         description="The override strategy to use.", default=None
     )
diff --git a/graphrag/config/models/vector_store_config.py b/graphrag/config/models/vector_store_config.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""Parameterization settings for the default configuration."""
+
+from pydantic import BaseModel, Field
+
+import graphrag.config.defaults as defs
+
+
+class VectorStoreConfig(BaseModel):
+    """The default configuration section for Vector Store.
+
+    Holds the vector store type, database URI, collection name, and the
+    overwrite flag. Every field falls back to the matching ``VECTOR_STORE_*``
+    constant in ``graphrag.config.defaults``.
+    """
+
+    # Use the enum's string value so the default is a plain ``str``, matching
+    # the annotation and the string form used when the type is compared or
+    # rendered elsewhere in the configuration code.
+    type: str = Field(
+        description="The vector store type to use.",
+        default=defs.VECTOR_STORE_TYPE.value,
+    )
+
+    db_uri: str = Field(
+        description="The database URI to use.", default=defs.VECTOR_STORE_DB_URI
+    )
+
+    collection_name: str = Field(
+        description="The collection name to use.",
+        default=defs.VECTOR_STORE_COLLECTION_NAME,
+    )
+
+    overwrite: bool = Field(
+        description="Overwrite the existing data.", default=defs.VECTOR_STORE_OVERWRITE
+    )
diff --git a/graphrag/config/resolve_path.py b/graphrag/config/resolve_path.py
diff --git a/tests/unit/config/test_config.py b/tests/unit/config/test_config.py
diff --git a/tests/unit/config/test_resolve_path.py b/tests/unit/config/test_resolve_path.py
diff --git a/tests/unit/config/utils.py b/tests/unit/config/utils.py

Original file line number	Diff line number	Diff line change
`@@ -27,9 +27,6 @@ class TextEmbeddingConfig(BaseModel):`
`27`	`27`	`names: list[str] = Field(`
`28`	`28`	`description="The specific embeddings to perform.", default=[]`
`29`	`29`	`)`
`30`		`- vector_store: dict = Field(`
`31`		`- description="The vector storage configuration", default=defs.VECTOR_STORE_DICT`
`32`		`- )`
`33`	`30`	`strategy: dict \| None = Field(`
`34`	`31`	`description="The override strategy to use.", default=None`
`35`	`32`	`)`