microsoft
diff --git a/.semversioner/next-release/patch-20241105004012425642.json b/.semversioner/next-release/patch-20241105004012425642.json
Lines changed: 4 additions & 0 deletions
diff --git a/docs/config/env_vars.md b/docs/config/env_vars.md
Lines changed: 7 additions & 5 deletions
diff --git a/docs/config/json_yaml.md b/docs/config/json_yaml.md
Lines changed: 5 additions & 3 deletions
diff --git a/graphrag/config/create_graphrag_config.py b/graphrag/config/create_graphrag_config.py
Lines changed: 1 addition & 0 deletions
diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py
Lines changed: 1 addition & 0 deletions
diff --git a/graphrag/config/input_models/snapshots_config_input.py b/graphrag/config/input_models/snapshots_config_input.py
Lines changed: 2 additions & 0 deletions
diff --git a/graphrag/config/models/snapshots_config.py b/graphrag/config/models/snapshots_config.py
Lines changed: 7 additions & 3 deletions
diff --git a/graphrag/index/create_pipeline_config.py b/graphrag/index/create_pipeline_config.py
Lines changed: 4 additions & 1 deletion
diff --git a/graphrag/index/flows/create_base_entity_graph.py b/graphrag/index/flows/create_base_entity_graph.py
Lines changed: 16 additions & 5 deletions
diff --git a/graphrag/index/flows/create_base_text_units.py b/graphrag/index/flows/create_base_text_units.py
Lines changed: 16 additions & 2 deletions
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "Transient entity graph and snapshotting."
+}
@@ -199,11 +199,13 @@ This section controls the reporting mechanism used by the pipeline, for common e
 
 ## Data Snapshotting
 
-| Parameter                           | Description                                 | Type   | Required or Optional | Default |
-| ----------------------------------- | ------------------------------------------- | ------ | -------------------- | ------- |
-| `GRAPHRAG_SNAPSHOT_GRAPHML`         | Whether to enable GraphML snapshots.        | `bool` | optional             | False   |
-| `GRAPHRAG_SNAPSHOT_RAW_ENTITIES`    | Whether to enable raw entity snapshots.     | `bool` | optional             | False   |
-| `GRAPHRAG_SNAPSHOT_TOP_LEVEL_NODES` | Whether to enable top-level node snapshots. | `bool` | optional             | False   |
+| Parameter                              | Description                                     | Type   | Required or Optional | Default |
+| -------------------------------------- | ----------------------------------------------- | ------ | -------------------- | ------- |
+| `GRAPHRAG_SNAPSHOT_EMBEDDINGS`         | Whether to enable embeddings snapshots.         | `bool` | optional             | False   |
+| `GRAPHRAG_SNAPSHOT_GRAPHML`            | Whether to enable GraphML snapshots.            | `bool` | optional             | False   |
+| `GRAPHRAG_SNAPSHOT_RAW_ENTITIES`       | Whether to enable raw entity snapshots.         | `bool` | optional             | False   |
+| `GRAPHRAG_SNAPSHOT_TOP_LEVEL_NODES`    | Whether to enable top-level node snapshots.     | `bool` | optional             | False   |
+| `GRAPHRAG_SNAPSHOT_TRANSIENT`          | Whether to enable transient table snapshots.    | `bool` | optional             | False   |
 
 # Miscellaneous Settings
 
 
@@ -216,9 +216,11 @@ This is the base LLM configuration section. Other steps may override this config
 
 ### Fields
 
-- `graphml` **bool** - Emit graphml snapshots.
-- `raw_entities` **bool** - Emit raw entity snapshots.
-- `top_level_nodes` **bool** - Emit top-level-node snapshots.
+- `embeddings` **bool** - Emit embeddings snapshots to parquet.
+- `graphml` **bool** - Emit graph snapshots to GraphML.
+- `raw_entities` **bool** - Emit raw entity snapshots to JSON.
+- `top_level_nodes` **bool** - Emit top-level-node snapshots to JSON.
+- `transient` **bool** - Emit transient workflow table snapshots to parquet.
 
 ## encoding_model
 
 
@@ -415,6 +415,7 @@ def hydrate_parallelization_params(
                 top_level_nodes=reader.bool("top_level_nodes")
                 or defs.SNAPSHOTS_TOP_LEVEL_NODES,
                 embeddings=reader.bool("embeddings") or defs.SNAPSHOTS_EMBEDDINGS,
+                transient=reader.bool("transient") or defs.SNAPSHOTS_TRANSIENT,
             )
         with reader.envvar_prefix(Section.umap), reader.use(values.get("umap")):
             umap_model = UmapConfig(
 
@@ -83,6 +83,7 @@
 SNAPSHOTS_RAW_ENTITIES = False
 SNAPSHOTS_TOP_LEVEL_NODES = False
 SNAPSHOTS_EMBEDDINGS = False
+SNAPSHOTS_TRANSIENT = False
 STORAGE_BASE_DIR = "output"
 STORAGE_TYPE = StorageType.file
 SUMMARIZE_DESCRIPTIONS_MAX_LENGTH = 500
 
@@ -9,6 +9,8 @@
 class SnapshotsConfigInput(TypedDict):
     """Configuration section for snapshots."""
 
+    embeddings: NotRequired[bool | str | None]
     graphml: NotRequired[bool | str | None]
     raw_entities: NotRequired[bool | str | None]
     top_level_nodes: NotRequired[bool | str | None]
+    transient: NotRequired[bool | str | None]
@@ -11,6 +11,10 @@
 class SnapshotsConfig(BaseModel):
     """Configuration section for snapshots."""
 
+    embeddings: bool = Field(
+        description="A flag indicating whether to take snapshots of embeddings.",
+        default=defs.SNAPSHOTS_EMBEDDINGS,
+    )
     graphml: bool = Field(
         description="A flag indicating whether to take snapshots of GraphML.",
         default=defs.SNAPSHOTS_GRAPHML,
@@ -23,7 +27,7 @@ class SnapshotsConfig(BaseModel):
         description="A flag indicating whether to take snapshots of top-level nodes.",
         default=defs.SNAPSHOTS_TOP_LEVEL_NODES,
     )
-    embeddings: bool = Field(
-        description="A flag indicating whether to take snapshots of embeddings.",
-        default=defs.SNAPSHOTS_EMBEDDINGS,
+    transient: bool = Field(
+        description="A flag indicating whether to take snapshots of transient tables.",
+        default=defs.SNAPSHOTS_TRANSIENT,
     )
@@ -171,6 +171,7 @@ def _text_unit_workflows(
         PipelineWorkflowReference(
             name=create_base_text_units,
             config={
+                "snapshot_transient": settings.snapshots.transient,
                 "chunk_by": settings.chunks.group_by_columns,
                 "text_chunk": {
                     "strategy": settings.chunks.resolved_strategy(
@@ -215,7 +216,9 @@ def _graph_workflows(settings: GraphRagConfig) -> list[PipelineWorkflowReference
         PipelineWorkflowReference(
             name=create_base_entity_graph,
             config={
-                "graphml_snapshot": settings.snapshots.graphml,
+                "snapshot_graphml": settings.snapshots.graphml,
+                "snapshot_transient": settings.snapshots.transient,
+                "snapshot_raw_entities": settings.snapshots.raw_entities,
                 "entity_extract": {
                     **settings.entity_extraction.parallelization.model_dump(),
                     "async_mode": settings.entity_extraction.async_mode,
 
@@ -42,8 +42,9 @@ async def create_base_entity_graph(
     summarization_strategy: dict[str, Any] | None = None,
     summarization_num_threads: int = 4,
     embedding_strategy: dict[str, Any] | None = None,
-    graphml_snapshot_enabled: bool = False,
-    raw_entity_snapshot_enabled: bool = False,
+    snapshot_graphml_enabled: bool = False,
+    snapshot_raw_entities_enabled: bool = False,
+    snapshot_transient_enabled: bool = False,
 ) -> pd.DataFrame:
     """All the steps to create the base entity graph."""
     # this returns a graph for each text unit, to be merged later
@@ -92,15 +93,15 @@ async def create_base_entity_graph(
             strategy=embedding_strategy,
         )
 
-    if raw_entity_snapshot_enabled:
+    if snapshot_raw_entities_enabled:
         await snapshot(
             entities,
             name="raw_extracted_entities",
             storage=storage,
             formats=["json"],
         )
 
-    if graphml_snapshot_enabled:
+    if snapshot_graphml_enabled:
         await snapshot_graphml(
             merged_graph,
             name="merged_graph",
@@ -131,4 +132,14 @@ async def create_base_entity_graph(
     if embedding_strategy:
         final_columns.append("embeddings")
 
-    return cast(pd.DataFrame, clustered[final_columns])
+    output = cast(pd.DataFrame, clustered[final_columns])
+
+    if snapshot_transient_enabled:
+        await snapshot(
+            output,
+            name="create_base_entity_graph",
+            storage=storage,
+            formats=["parquet"],
+        )
+
+    return output
@@ -15,16 +15,20 @@
 )
 
 from graphrag.index.operations.chunk_text import chunk_text
+from graphrag.index.operations.snapshot import snapshot
+from graphrag.index.storage import PipelineStorage
 from graphrag.index.utils import gen_md5_hash
 
 
-def create_base_text_units(
+async def create_base_text_units(
     documents: pd.DataFrame,
     callbacks: VerbCallbacks,
+    storage: PipelineStorage,
     chunk_column_name: str,
     n_tokens_column_name: str,
     chunk_by_columns: list[str],
     chunk_strategy: dict[str, Any] | None = None,
+    snapshot_transient_enabled: bool = False,
 ) -> pd.DataFrame:
     """All the steps to transform base text_units."""
     sort = documents.sort_values(by=["id"], ascending=[True])
@@ -73,10 +77,20 @@ def create_base_text_units(
     )
     chunked["id"] = chunked["chunk_id"]
 
-    return cast(
+    output = cast(
         pd.DataFrame, chunked[chunked[chunk_column_name].notna()].reset_index(drop=True)
     )
 
+    if snapshot_transient_enabled:
+        await snapshot(
+            output,
+            name="create_base_text_units",
+            storage=storage,
+            formats=["parquet"],
+        )
+
+    return output
+
 
 # TODO: would be nice to inline this completely in the main method with pandas
 def _aggregate_df(
Original file line number	Diff line change
`@@ -0,0 +1,4 @@`
 +{
 +  "type": "patch",
 +  "description": "Transient entity graph and snapshotting."
 +}
Original file line number	Diff line number	Diff line change
`@@ -415,6 +415,7 @@ def hydrate_parallelization_params(`
`415`	`415`	`top_level_nodes=reader.bool("top_level_nodes")`
`416`	`416`	`or defs.SNAPSHOTS_TOP_LEVEL_NODES,`
`417`	`417`	`embeddings=reader.bool("embeddings") or defs.SNAPSHOTS_EMBEDDINGS,`
	`418`	`+ transient=reader.bool("transient") or defs.SNAPSHOTS_TRANSIENT,`
`418`	`419`	`)`
`419`	`420`	`with reader.envvar_prefix(Section.umap), reader.use(values.get("umap")):`
`420`	`421`	`umap_model = UmapConfig(`