
Commit 35b6393

Incremental flow rework (#1696)
* Rework update output structure
* Semver
* Fix unit test
* Update frequency in incremental

Co-authored-by: Alonso Guevara <[email protected]>
1 parent 5ef2399 commit 35b6393

File tree

13 files changed: +156 -140 lines
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{
+    "type": "minor",
+    "description": "Rework the update output storage structure."
+}

graphrag/api/index.py

Lines changed: 1 addition & 2 deletions
@@ -26,6 +26,7 @@
 async def build_index(
     config: GraphRagConfig,
     method: IndexingMethod = IndexingMethod.Standard,
+    is_update_run: bool = False,
     memory_profile: bool = False,
     callbacks: list[WorkflowCallbacks] | None = None,
     progress_logger: ProgressLogger | None = None,
@@ -50,8 +51,6 @@ async def build_index(
     list[PipelineRunResult]
         The list of pipeline run results
     """
-    is_update_run = bool(config.update_index_output)
-
     pipeline_cache = (
         NoopPipelineCache() if config.cache.type == CacheType.none is None else None
     )
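With this change, is_update_run is an explicit argument to build_index instead of being inferred from whether update_index_output is set. A minimal call sketch, not part of this commit, assuming build_index is re-exported from graphrag.api, load_config is importable from graphrag.config.load_config (the same loader the CLI uses below), and PipelineRunResult exposes workflow and errors fields:

    import asyncio
    from pathlib import Path

    from graphrag.api import build_index  # assumed re-export of graphrag.api.index.build_index
    from graphrag.config.load_config import load_config  # assumed module path

    async def main() -> None:
        config = load_config(Path("./ragtest"))  # hypothetical project root
        # Run an incremental update; a standard build would omit the flag (defaults to False).
        results = await build_index(config=config, is_update_run=True)
        for result in results:
            print(result.workflow, result.errors)

    asyncio.run(main())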

graphrag/cli/index.py

Lines changed: 7 additions & 10 deletions
@@ -78,11 +78,13 @@ def index_cli(
     if output_dir:
         cli_overrides["output.base_dir"] = str(output_dir)
         cli_overrides["reporting.base_dir"] = str(output_dir)
+        cli_overrides["update_index_output.base_dir"] = str(output_dir)
     config = load_config(root_dir, config_filepath, cli_overrides)
 
     _run_index(
         config=config,
         method=method,
+        is_update_run=False,
         verbose=verbose,
         memprofile=memprofile,
         cache=cache,
@@ -108,21 +110,14 @@ def update_cli(
     if output_dir:
         cli_overrides["output.base_dir"] = str(output_dir)
         cli_overrides["reporting.base_dir"] = str(output_dir)
-    config = load_config(root_dir, config_filepath, cli_overrides)
-
-    # Check if update output exist, if not configure it with default values
-    if not config.update_index_output:
-        from graphrag.config.defaults import OUTPUT_TYPE, UPDATE_OUTPUT_BASE_DIR
-        from graphrag.config.models.output_config import OutputConfig
+        cli_overrides["update_index_output.base_dir"] = str(output_dir)
 
-        config.update_index_output = OutputConfig(
-            type=OUTPUT_TYPE,
-            base_dir=UPDATE_OUTPUT_BASE_DIR,
-        )
+    config = load_config(root_dir, config_filepath, cli_overrides)
 
     _run_index(
         config=config,
         method=method,
+        is_update_run=True,
         verbose=verbose,
         memprofile=memprofile,
         cache=cache,
@@ -135,6 +130,7 @@ def update_cli(
 def _run_index(
     config,
     method,
+    is_update_run,
     verbose,
     memprofile,
     cache,
@@ -176,6 +172,7 @@ def _run_index(
         api.build_index(
             config=config,
             method=method,
+            is_update_run=is_update_run,
            memory_profile=memprofile,
            progress_logger=progress_logger,
        )
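Both commands now funnel the --output override through dotted cli_overrides keys ("output.base_dir", "reporting.base_dir", "update_index_output.base_dir") that load_config applies onto the nested settings. A hypothetical, self-contained illustration of how such dotted keys map onto a nested dict (apply_override is not a graphrag function):

    def apply_override(settings: dict, dotted_key: str, value) -> None:
        """Set a nested value from a dotted key such as 'update_index_output.base_dir'."""
        *parents, leaf = dotted_key.split(".")
        node = settings
        for key in parents:
            node = node.setdefault(key, {})
        node[leaf] = value

    settings = {"output": {"base_dir": "output"}}
    apply_override(settings, "update_index_output.base_dir", "./my_run")
    # settings == {"output": {"base_dir": "output"}, "update_index_output": {"base_dir": "./my_run"}}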

graphrag/config/defaults.py

Lines changed: 1 addition & 2 deletions
@@ -128,12 +128,11 @@
 SNAPSHOTS_GRAPHML = False
 SNAPSHOTS_EMBEDDINGS = False
 OUTPUT_BASE_DIR = "output"
-OUTPUT_DEFAULT_ID = "default_output"
 OUTPUT_TYPE = OutputType.file
+UPDATE_OUTPUT_BASE_DIR = "update_output"
 SUMMARIZE_DESCRIPTIONS_MAX_LENGTH = 500
 SUMMARIZE_MODEL_ID = DEFAULT_CHAT_MODEL_ID
 UMAP_ENABLED = False
-UPDATE_OUTPUT_BASE_DIR = "update_output"
 
 # Graph Pruning
 PRUNE_MIN_NODE_FREQ = 2

graphrag/config/init_content.py

Lines changed: 0 additions & 6 deletions
@@ -91,12 +91,6 @@
   type: {defs.OUTPUT_TYPE.value} # [file, blob, cosmosdb]
   base_dir: "{defs.OUTPUT_BASE_DIR}"
 
-## only turn this on if running `graphrag index` with custom settings
-## we normally use `graphrag update` with the defaults
-update_index_output:
-  # type: {defs.OUTPUT_TYPE.value} # [file, blob, cosmosdb]
-  # base_dir: "{defs.UPDATE_OUTPUT_BASE_DIR}"
-
 ### Workflow settings ###
 
 extract_graph:

graphrag/config/models/graph_rag_config.py

Lines changed: 7 additions & 7 deletions
@@ -134,20 +134,20 @@ def _validate_multi_output_base_dirs(self) -> None:
                 (Path(self.root_dir) / output.base_dir).resolve()
             )
 
-    update_index_output: OutputConfig | None = Field(
+    update_index_output: OutputConfig = Field(
         description="The output configuration for the updated index.",
-        default=None,
+        default=OutputConfig(
+            type=defs.OUTPUT_TYPE,
+            base_dir=defs.UPDATE_OUTPUT_BASE_DIR,
+        ),
     )
     """The output configuration for the updated index."""
 
     def _validate_update_index_output_base_dir(self) -> None:
         """Validate the update index output base directory."""
-        if (
-            self.update_index_output
-            and self.update_index_output.type == defs.OutputType.file
-        ):
+        if self.update_index_output.type == defs.OutputType.file:
             if self.update_index_output.base_dir.strip() == "":
-                msg = "Update index output base directory is required for file output. Please rerun `graphrag init` and set the update index output configuration."
+                msg = "update_index_output base directory is required for file output. Please rerun `graphrag init` and set the update_index_output configuration."
                 raise ValueError(msg)
             self.update_index_output.base_dir = str(
                 (Path(self.root_dir) / self.update_index_output.base_dir).resolve()
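The field goes from optional (OutputConfig | None, default None) to always present with a concrete default, which is why the None guard disappears from the validator. A self-contained Pydantic sketch of the same pattern, using placeholder classes rather than graphrag's actual models:

    from pydantic import BaseModel, Field

    class OutputConfig(BaseModel):
        type: str = "file"
        base_dir: str = "output"

    class Config(BaseModel):
        # Before: OutputConfig | None with default=None, so every consumer had to branch on None.
        # After: a concrete default instance, so validation and path resolution can assume it exists.
        update_index_output: OutputConfig = Field(
            default=OutputConfig(type="file", base_dir="update_output"),
        )

    cfg = Config()
    assert cfg.update_index_output.base_dir == "update_output"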

graphrag/index/run/run_pipeline.py

Lines changed: 52 additions & 37 deletions
@@ -5,6 +5,7 @@
 
 import json
 import logging
+import re
 import time
 import traceback
 from collections.abc import AsyncIterable
@@ -31,7 +32,7 @@
 from graphrag.logger.progress import Progress
 from graphrag.storage.factory import StorageFactory
 from graphrag.storage.pipeline_storage import PipelineStorage
-from graphrag.utils.storage import write_table_to_storage
+from graphrag.utils.storage import load_table_from_storage, write_table_to_storage
 
 log = logging.getLogger(__name__)
 
@@ -66,45 +67,49 @@ async def run_pipeline(
     if is_update_run:
         progress_logger.info("Running incremental indexing.")
 
-        update_storage_config = config.update_index_output.model_dump()  # type: ignore
-        update_index_storage = StorageFactory().create_storage(
-            storage_type=update_storage_config["type"],  # type: ignore
-            kwargs=update_storage_config,
-        )
-
         delta_dataset = await get_delta_docs(dataset, storage)
 
-        # Fail on empty delta dataset
+        # warn on empty delta dataset
        if delta_dataset.new_inputs.empty:
-            error_msg = "Incremental Indexing Error: No new documents to process."
-            raise ValueError(error_msg)
-
-        delta_storage = update_index_storage.child("delta")
-
-        # Run the pipeline on the new documents
-        tables_dict = {}
-        async for table in _run_pipeline(
-            pipeline=pipeline,
-            config=config,
-            dataset=delta_dataset.new_inputs,
-            cache=cache,
-            storage=delta_storage,
-            callbacks=callback_chain,
-            logger=progress_logger,
-        ):
-            tables_dict[table.workflow] = table.result
-
-        progress_logger.success("Finished running workflows on new documents.")
-
-        await update_dataframe_outputs(
-            dataframe_dict=tables_dict,
-            storage=storage,
-            update_storage=update_index_storage,
-            config=config,
-            cache=cache,
-            callbacks=NoopWorkflowCallbacks(),
-            progress_logger=progress_logger,
-        )
+            warning_msg = "Incremental indexing found no new documents, exiting."
+            progress_logger.warning(warning_msg)
+        else:
+            update_storage_config = config.update_index_output.model_dump()  # type: ignore
+            update_storage = StorageFactory().create_storage(
+                storage_type=update_storage_config["type"],  # type: ignore
+                kwargs=update_storage_config,
+            )
+            # we use this to store the new subset index, and will merge its content with the previous index
+            timestamped_storage = update_storage.child(time.strftime("%Y%m%d-%H%M%S"))
+            delta_storage = timestamped_storage.child("delta")
+            # copy the previous output to a backup folder, so we can replace it with the update
+            # we'll read from this later when we merge the old and new indexes
+            previous_storage = timestamped_storage.child("previous")
+            await _copy_previous_output(storage, previous_storage)
+
+            # Run the pipeline on the new documents
+            async for table in _run_pipeline(
+                pipeline=pipeline,
+                config=config,
+                dataset=delta_dataset.new_inputs,
+                cache=cache,
+                storage=delta_storage,
+                callbacks=callback_chain,
+                logger=progress_logger,
+            ):
+                yield table
+
+            progress_logger.success("Finished running workflows on new documents.")
+
+            await update_dataframe_outputs(
+                previous_storage=previous_storage,
+                delta_storage=delta_storage,
+                output_storage=storage,
+                config=config,
+                cache=cache,
+                callbacks=NoopWorkflowCallbacks(),
+                progress_logger=progress_logger,
+            )
 
     else:
         progress_logger.info("Running standard indexing.")
@@ -172,3 +177,13 @@ async def _dump_stats(stats: PipelineRunStats, storage: PipelineStorage) -> None:
     await storage.set(
         "stats.json", json.dumps(asdict(stats), indent=4, ensure_ascii=False)
     )
+
+
+async def _copy_previous_output(
+    storage: PipelineStorage,
+    copy_storage: PipelineStorage,
+):
+    for file in storage.find(re.compile(r"\.parquet$")):
+        base_name = file[0].replace(".parquet", "")
+        table = await load_table_from_storage(base_name, storage)
+        await write_table_to_storage(table, base_name, copy_storage)
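The reworked flow keeps each incremental run self-contained: a timestamped child folder under the update output holds a "previous" copy of the prior parquet outputs and a "delta" index built from the new documents only, and the merged result is written back to the main output storage. A rough local-filesystem analogue of _copy_previous_output and the timestamped layout, for illustration only; the real code goes through graphrag's PipelineStorage abstraction:

    import re
    import shutil
    import time
    from pathlib import Path

    def copy_previous_output(output_dir: Path, update_dir: Path) -> Path:
        """Copy the prior *.parquet outputs into <update_dir>/<timestamp>/previous."""
        timestamped = update_dir / time.strftime("%Y%m%d-%H%M%S")
        previous = timestamped / "previous"
        previous.mkdir(parents=True, exist_ok=True)
        parquet = re.compile(r"\.parquet$")
        for file in output_dir.iterdir():
            if parquet.search(file.name):
                shutil.copy2(file, previous / file.name)
        return timestamped

    # The delta index would then be written under <timestamped>/delta before being
    # merged back into output_dir, mirroring the storage.child(...) calls above.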

graphrag/index/update/entities.py

Lines changed: 10 additions & 0 deletions
@@ -65,10 +65,16 @@ def _group_and_resolve_entities(
             "description": lambda x: list(x.astype(str)),  # Ensure str
             # Concatenate nd.array into a single list
             "text_unit_ids": lambda x: list(itertools.chain(*x.tolist())),
+            "degree": "first",  # todo: we could probably re-compute this with the entire new graph
+            "x": "first",
+            "y": "first",
         })
         .reset_index()
     )
 
+    # recompute frequency to include new text units
+    aggregated["frequency"] = aggregated["text_unit_ids"].apply(len)
+
     # Force the result into a DataFrame
     resolved: pd.DataFrame = pd.DataFrame(aggregated)
 
@@ -82,6 +88,10 @@ def _group_and_resolve_entities(
             "type",
             "description",
             "text_unit_ids",
+            "frequency",
+            "degree",
+            "x",
+            "y",
         ],
     ]
 