Commit dad2176

Miscellaneous code cleanup procedures (#1452)
1 parent 0b2120c commit dad2176

96 files changed: +201 −452 lines changed

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "miscellaneous code cleanup and minor changes for better alignment of style across the codebase."
+}

docs/config/env_vars.md

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@

 ## Text-Embeddings Customization

-By default, the GraphRAG indexer will only emit embeddings required for our query methods. However, the model has embeddings defined for all plaintext fields, and these can be generated by setting the `GRAPHRAG_EMBEDDING_TARGET` environment variable to `all`.
+By default, the GraphRAG indexer will only export embeddings required for our query methods. However, the model has embeddings defined for all plaintext fields, and these can be generated by setting the `GRAPHRAG_EMBEDDING_TARGET` environment variable to `all`.

 If the embedding target is `all`, and you want to only embed a subset of these fields, you may specify which embeddings to skip using the `GRAPHRAG_EMBEDDING_SKIP` argument described below.

@@ -152,7 +152,7 @@ These settings control the data input used by the pipeline. Any settings with a

 ## Storage

-This section controls the storage mechanism used by the pipeline used for emitting output tables.
+This section controls the storage mechanism used by the pipeline used for exporting output tables.

 | Parameter | Description | Type | Required or Optional | Default |
 | ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- | -------------------- | ------- |
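A minimal sketch of the embedding-target setting described in this file, assuming the environment variables are set in the same process that launches the indexer; the skip value shown is a placeholder, not a documented field name.

```python
import os

# Embed all plaintext fields rather than only the ones required for querying.
os.environ["GRAPHRAG_EMBEDDING_TARGET"] = "all"

# Optionally omit a subset of embeddings; the name below is illustrative only.
os.environ["GRAPHRAG_EMBEDDING_SKIP"] = "document.raw_content"
```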

docs/config/yaml.md

Lines changed: 7 additions & 7 deletions
@@ -67,7 +67,7 @@ This is the base LLM configuration section. Other steps may override this config
 - `async_mode` (see Async Mode top-level config)
 - `batch_size` **int** - The maximum batch size to use.
 - `batch_max_tokens` **int** - The maximum batch # of tokens.
-- `target` **required|all|none** - Determines which set of embeddings to emit.
+- `target` **required|all|none** - Determines which set of embeddings to export.
 - `skip` **list[str]** - Which embeddings to skip. Only useful if target=all to customize the list.
 - `vector_store` **dict** - The vector store to use. Configured for lancedb by default.
 - `type` **str** - `lancedb` or `azure_ai_search`. Default=`lancedb`
@@ -203,7 +203,7 @@ This is the base LLM configuration section. Other steps may override this config

 #### Fields

-- `max_cluster_size` **int** - The maximum cluster size to emit.
+- `max_cluster_size` **int** - The maximum cluster size to export.
 - `strategy` **dict** - Fully override the cluster_graph strategy.

 ### embed_graph
@@ -228,11 +228,11 @@ This is the base LLM configuration section. Other steps may override this config

 #### Fields

-- `embeddings` **bool** - Emit embeddings snapshots to parquet.
-- `graphml` **bool** - Emit graph snapshots to GraphML.
-- `raw_entities` **bool** - Emit raw entity snapshots to JSON.
-- `top_level_nodes` **bool** - Emit top-level-node snapshots to JSON.
-- `transient` **bool** - Emit transient workflow tables snapshots to parquet.
+- `embeddings` **bool** - Export embeddings snapshots to parquet.
+- `graphml` **bool** - Export graph snapshots to GraphML.
+- `raw_entities` **bool** - Export raw entity snapshots to JSON.
+- `top_level_nodes` **bool** - Export top-level-node snapshots to JSON.
+- `transient` **bool** - Export transient workflow tables snapshots to parquet.

 ### encoding_model

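The `target`/`skip` pair documented above can be easier to follow with a small sketch. This is a hypothetical illustration, not GraphRAG's actual implementation, and the embedding names are invented for the example.

```python
# Hypothetical resolution of the embeddings `target` and `skip` settings;
# the embedding names below are made up for illustration.
REQUIRED_EMBEDDINGS = {"entity.description", "community.full_content"}
ALL_EMBEDDINGS = REQUIRED_EMBEDDINGS | {"document.raw_content", "relationship.description"}

def resolve_embedding_targets(target: str, skip: list[str]) -> set[str]:
    """Return the set of embeddings to export for target in {required, all, none}."""
    if target == "none":
        return set()
    selected = ALL_EMBEDDINGS if target == "all" else REQUIRED_EMBEDDINGS
    return selected - set(skip)

# e.g. target="all" with skip=["document.raw_content"] keeps the other three fields.
```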

docs/index/default_dataflow.md

Lines changed: 6 additions & 6 deletions
@@ -105,9 +105,9 @@ Now that we have a graph of entities and relationships, each with a list of desc

 ### Claim Extraction & Emission

-Finally, as an independent workflow, we extract claims from the source TextUnits. These claims represent positive factual statements with an evaluated status and time-bounds. These are emitted as a primary artifact called **Covariates**.
+Finally, as an independent workflow, we extract claims from the source TextUnits. These claims represent positive factual statements with an evaluated status and time-bounds. These get exported as a primary artifact called **Covariates**.

-Note: claim extraction is _optional_ and turned off by default. This is because claim extraction generally needs prompt tuning to be useful.
+Note: claim extraction is _optional_ and turned off by default. This is because claim extraction generally requires prompt tuning to be useful.

 ## Phase 3: Graph Augmentation

@@ -131,7 +131,7 @@ In this step, we generate a vector representation of our graph using the Node2Ve

 ### Graph Tables Emission

-Once our graph augmentation steps are complete, the final **Entities** and **Relationships** tables are emitted after their text fields are text-embedded.
+Once our graph augmentation steps are complete, the final **Entities** and **Relationships** tables are exported after their text fields are text-embedded.

 ## Phase 4: Community Summarization

@@ -161,7 +161,7 @@ In this step, we generate a vector representation of our communities by generati

 ### Community Tables Emission

-At this point, some bookkeeping work is performed and we emit the **Communities** and **CommunityReports** tables.
+At this point, some bookkeeping work is performed and we export the **Communities** and **CommunityReports** tables.

 ## Phase 5: Document Processing

@@ -189,7 +189,7 @@ In this step, we generate a vector representation of our documents using an aver

 ### Documents Table Emission

-At this point, we can emit the **Documents** table into the knowledge Model.
+At this point, we can export the **Documents** table into the knowledge Model.

 ## Phase 6: Network Visualization

@@ -203,4 +203,4 @@ flowchart LR
     nv[Umap Documents] --> ne[Umap Entities] --> ng[Nodes Table Emission]
 ```

-For each of the logical graphs, we perform a UMAP dimensionality reduction to generate a 2D representation of the graph. This will allow us to visualize the graph in a 2D space and understand the relationships between the nodes in the graph. The UMAP embeddings are then emitted as a table of _Nodes_. The rows of this table include a discriminator indicating whether the node is a document or an entity, and the UMAP coordinates.
+For each of the logical graphs, we perform a UMAP dimensionality reduction to generate a 2D representation of the graph. This will allow us to visualize the graph in a 2D space and understand the relationships between the nodes in the graph. The UMAP embeddings are then exported as a table of _Nodes_. The rows of this table include a discriminator indicating whether the node is a document or an entity, and the UMAP coordinates.
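The UMAP step described in this file can be sketched in a few lines. This is only an illustration using the umap-learn and pandas packages, not the GraphRAG workflow itself; the embedding dimensions and node labels are invented.

```python
import numpy as np
import pandas as pd
import umap  # pip install umap-learn

rng = np.random.default_rng(42)
embeddings = rng.random((50, 128))            # pretend document/entity embeddings
kinds = ["entity"] * 30 + ["document"] * 20   # discriminator column

# Reduce to 2D so the nodes can be plotted.
coords = umap.UMAP(n_components=2).fit_transform(embeddings)

nodes = pd.DataFrame({"type": kinds, "x": coords[:, 0], "y": coords[:, 1]})
print(nodes.head())
```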

examples/use_built_in_workflows/run.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@

 from graphrag.index.config.input import PipelineCSVInputConfig
 from graphrag.index.config.workflow import PipelineWorkflowReference
-from graphrag.index.input.load_input import load_input
+from graphrag.index.input.factory import create_input
 from graphrag.index.run import run_pipeline, run_pipeline_with_config

 sample_data_dir = os.path.join(
@@ -14,7 +14,7 @@

 # Load our dataset once
 shared_dataset = asyncio.run(
-    load_input(
+    create_input(
         PipelineCSVInputConfig(
             file_pattern=".*\\.csv$",
             base_dir=sample_data_dir,

graphrag/api/index.py

Lines changed: 1 addition & 11 deletions
@@ -10,11 +10,10 @@

 from pathlib import Path

+from graphrag.cache.noop_pipeline_cache import NoopPipelineCache
 from graphrag.config.enums import CacheType
 from graphrag.config.models.graph_rag_config import GraphRagConfig
-from graphrag.index.cache.noop_pipeline_cache import NoopPipelineCache
 from graphrag.index.create_pipeline_config import create_pipeline_config
-from graphrag.index.emit.types import TableEmitterType
 from graphrag.index.run import run_pipeline_with_config
 from graphrag.index.typing import PipelineRunResult
 from graphrag.logging.base import ProgressReporter
@@ -27,7 +26,6 @@ async def build_index(
     is_resume_run: bool = False,
     memory_profile: bool = False,
     progress_reporter: ProgressReporter | None = None,
-    emit: list[TableEmitterType] = [TableEmitterType.Parquet],  # noqa: B006
 ) -> list[PipelineRunResult]:
     """Run the pipeline with the given configuration.

@@ -45,9 +43,6 @@
         Whether to enable memory profiling.
     progress_reporter : ProgressReporter | None default=None
         The progress reporter.
-    emit : list[str]
-        The list of emitter types to emit.
-        Accepted values {"parquet", "csv"}.

     Returns
     -------
@@ -60,10 +55,6 @@
         msg = "Cannot resume and update a run at the same time."
         raise ValueError(msg)

-    # Ensure Parquet is part of the emitters
-    if TableEmitterType.Parquet not in emit:
-        emit.append(TableEmitterType.Parquet)
-
     config = _patch_vector_config(config)

     pipeline_config = create_pipeline_config(config)
@@ -77,7 +68,6 @@
         memory_profile=memory_profile,
         cache=pipeline_cache,
         progress_reporter=progress_reporter,
-        emit=emit,
         is_resume_run=is_resume_run,
         is_update_run=is_update_run,
     ):
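With the `emit` parameter removed, a caller no longer chooses emitter types; parquet output is always produced. A minimal sketch of invoking the updated API, assuming a `GraphRagConfig` has already been built elsewhere (config loading is not shown):

```python
import asyncio

from graphrag.api.index import build_index
from graphrag.config.models.graph_rag_config import GraphRagConfig

async def run_indexing(config: GraphRagConfig) -> None:
    # No emit argument anymore; parquet tables are written by default.
    results = await build_index(
        config,
        is_resume_run=False,
        memory_profile=False,
        progress_reporter=None,
    )
    for result in results:  # list[PipelineRunResult]
        print(result)

# asyncio.run(run_indexing(config))  # `config` loaded from settings elsewhere
```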

graphrag/api/query.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@
     entity_description_embedding,
 )
 from graphrag.logging.print_progress import PrintProgressReporter
-from graphrag.query.factories import (
+from graphrag.query.factory import (
     get_drift_search_engine,
     get_global_search_engine,
     get_local_search_engine,
@@ -1,4 +1,4 @@
 # Copyright (c) 2024 Microsoft Corporation.
 # Licensed under the MIT License

-"""The Indexing Engine storage package root."""
+"""A package containing cache implementations."""
Lines changed: 12 additions & 9 deletions
@@ -1,7 +1,7 @@
 # Copyright (c) 2024 Microsoft Corporation.
 # Licensed under the MIT License

-"""A module containing load_cache method definition."""
+"""A module containing create_cache method definition."""

 from __future__ import annotations

@@ -12,29 +12,32 @@
     PipelineBlobCacheConfig,
     PipelineFileCacheConfig,
 )
-from graphrag.index.storage.blob_pipeline_storage import BlobPipelineStorage
-from graphrag.index.storage.file_pipeline_storage import FilePipelineStorage
+from graphrag.storage.blob_pipeline_storage import BlobPipelineStorage
+from graphrag.storage.file_pipeline_storage import FilePipelineStorage

 if TYPE_CHECKING:
+    from graphrag.cache.pipeline_cache import PipelineCache
     from graphrag.index.config.cache import (
         PipelineCacheConfig,
     )

-from graphrag.index.cache.json_pipeline_cache import JsonPipelineCache
-from graphrag.index.cache.memory_pipeline_cache import create_memory_cache
-from graphrag.index.cache.noop_pipeline_cache import NoopPipelineCache
+from graphrag.cache.json_pipeline_cache import JsonPipelineCache
+from graphrag.cache.memory_pipeline_cache import InMemoryCache
+from graphrag.cache.noop_pipeline_cache import NoopPipelineCache


-def load_cache(config: PipelineCacheConfig | None, root_dir: str | None):
-    """Load the cache from the given config."""
+def create_cache(
+    config: PipelineCacheConfig | None, root_dir: str | None
+) -> PipelineCache:
+    """Create a cache from the given config."""
     if config is None:
         return NoopPipelineCache()

     match config.type:
         case CacheType.none:
             return NoopPipelineCache()
         case CacheType.memory:
-            return create_memory_cache()
+            return InMemoryCache()
         case CacheType.file:
             config = cast(PipelineFileCacheConfig, config)
             storage = FilePipelineStorage(root_dir).child(config.base_dir)
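A short sketch of calling the renamed factory. The module path `graphrag.cache.factory` is an assumption (the renamed file path is not shown in this excerpt), and the directory names are placeholders:

```python
from graphrag.cache.factory import create_cache  # assumed module path
from graphrag.index.config.cache import PipelineFileCacheConfig

# With no config, a NoopPipelineCache is returned.
noop_cache = create_cache(None, root_dir=None)

# A file-backed cache; base_dir/root_dir values here are illustrative.
file_cache = create_cache(
    PipelineFileCacheConfig(base_dir="cache"),
    root_dir="./output",
)
```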

graphrag/index/cache/json_pipeline_cache.py renamed to graphrag/cache/json_pipeline_cache.py

Lines changed: 2 additions & 2 deletions
@@ -6,8 +6,8 @@
 import json
 from typing import Any

-from graphrag.index.cache.pipeline_cache import PipelineCache
-from graphrag.index.storage.pipeline_storage import PipelineStorage
+from graphrag.cache.pipeline_cache import PipelineCache
+from graphrag.storage.pipeline_storage import PipelineStorage


 class JsonPipelineCache(PipelineCache):
