Skip to content

Commit 96219a2

Browse files
authored
Register workflows (#1691)
* Add workflow registration * Add ability to mutate config by workflows * Separate graph finalization * Separate graph pruning * Semver * Update tests * Update smoke tests * Fix iterrows on create_graph * Remove prune_graph from llm construction * Update test data * Remove prune_graph from smoke tests
1 parent 981fd31 commit 96219a2

37 files changed

+369
-166
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "patch",
3+
"description": "Separates graph pruning for differential usage."
4+
}

graphrag/api/index.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
from graphrag.config.enums import CacheType, IndexingMethod
1717
from graphrag.config.models.graph_rag_config import GraphRagConfig
1818
from graphrag.index.run.run_pipeline import run_pipeline
19-
from graphrag.index.typing import PipelineRunResult
20-
from graphrag.index.workflows.factory import create_pipeline
19+
from graphrag.index.typing import PipelineRunResult, WorkflowFunction
20+
from graphrag.index.workflows.factory import PipelineFactory
2121
from graphrag.logger.base import ProgressLogger
2222

2323
log = logging.getLogger(__name__)
@@ -63,7 +63,7 @@ async def build_index(
6363
if memory_profile:
6464
log.warning("New pipeline does not yet support memory profiling.")
6565

66-
pipeline = create_pipeline(config, method)
66+
pipeline = PipelineFactory.create_pipeline(config, method)
6767

6868
async for output in run_pipeline(
6969
pipeline,
@@ -82,3 +82,8 @@ async def build_index(
8282
progress_logger.info(str(output.result))
8383

8484
return outputs
85+
86+
87+
def register_workflow_function(name: str, workflow: WorkflowFunction):
    """Register a custom workflow under *name*.

    After registration the name may be listed in the settings.yaml
    workflows list, and the pipeline will run this function for it.
    """
    # Thin convenience wrapper over the factory's registry.
    PipelineFactory.register(name, workflow)

graphrag/index/flows/extract_graph.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,9 @@
1010
from graphrag.cache.pipeline_cache import PipelineCache
1111
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
1212
from graphrag.config.enums import AsyncType
13-
from graphrag.config.models.embed_graph_config import EmbedGraphConfig
1413
from graphrag.index.operations.extract_graph.extract_graph import (
1514
extract_graph as extractor,
1615
)
17-
from graphrag.index.operations.finalize_entities import finalize_entities
18-
from graphrag.index.operations.finalize_relationships import finalize_relationships
1916
from graphrag.index.operations.summarize_descriptions import (
2017
summarize_descriptions,
2118
)
@@ -31,8 +28,6 @@ async def extract_graph(
3128
entity_types: list[str] | None = None,
3229
summarization_strategy: dict[str, Any] | None = None,
3330
summarization_num_threads: int = 4,
34-
embed_config: EmbedGraphConfig | None = None,
35-
layout_enabled: bool = False,
3631
) -> tuple[pd.DataFrame, pd.DataFrame]:
3732
"""All the steps to create the base entity graph."""
3833
# this returns a graph for each text unit, to be merged later
@@ -76,11 +71,7 @@ async def extract_graph(
7671
extracted_entities.drop(columns=["description"], inplace=True)
7772
entities = extracted_entities.merge(entity_summaries, on="title", how="left")
7873

79-
final_entities = finalize_entities(
80-
entities, relationships, callbacks, embed_config, layout_enabled
81-
)
82-
final_relationships = finalize_relationships(relationships)
83-
return (final_entities, final_relationships)
74+
return (entities, relationships)
8475

8576

8677
def _validate_data(df: pd.DataFrame) -> bool:

graphrag/index/flows/extract_graph_nlp.py

Lines changed: 4 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -6,29 +6,17 @@
66
import pandas as pd
77

88
from graphrag.cache.pipeline_cache import PipelineCache
9-
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
10-
from graphrag.config.models.embed_graph_config import EmbedGraphConfig
119
from graphrag.config.models.extract_graph_nlp_config import ExtractGraphNLPConfig
12-
from graphrag.config.models.prune_graph_config import PruneGraphConfig
1310
from graphrag.index.operations.build_noun_graph.build_noun_graph import build_noun_graph
1411
from graphrag.index.operations.build_noun_graph.np_extractors.factory import (
1512
create_noun_phrase_extractor,
1613
)
17-
from graphrag.index.operations.create_graph import create_graph
18-
from graphrag.index.operations.finalize_entities import finalize_entities
19-
from graphrag.index.operations.finalize_relationships import finalize_relationships
20-
from graphrag.index.operations.graph_to_dataframes import graph_to_dataframes
21-
from graphrag.index.operations.prune_graph import prune_graph
2214

2315

2416
async def extract_graph_nlp(
2517
text_units: pd.DataFrame,
26-
callbacks: WorkflowCallbacks,
2718
cache: PipelineCache,
2819
extraction_config: ExtractGraphNLPConfig,
29-
pruning_config: PruneGraphConfig,
30-
embed_config: EmbedGraphConfig | None = None,
31-
layout_enabled: bool = False,
3220
) -> tuple[pd.DataFrame, pd.DataFrame]:
3321
"""All the steps to create the base entity graph."""
3422
text_analyzer_config = extraction_config.text_analyzer
@@ -41,37 +29,9 @@ async def extract_graph_nlp(
4129
cache=cache,
4230
)
4331

44-
# create a temporary graph to prune, then turn it back into dataframes
45-
graph = create_graph(extracted_edges, edge_attr=["weight"], nodes=extracted_nodes)
46-
pruned = prune_graph(
47-
graph,
48-
min_node_freq=pruning_config.min_node_freq,
49-
max_node_freq_std=pruning_config.max_node_freq_std,
50-
min_node_degree=pruning_config.min_node_degree,
51-
max_node_degree_std=pruning_config.max_node_degree_std,
52-
min_edge_weight_pct=pruning_config.min_edge_weight_pct,
53-
remove_ego_nodes=pruning_config.remove_ego_nodes,
54-
lcc_only=pruning_config.lcc_only,
55-
)
56-
57-
pruned_nodes, pruned_edges = graph_to_dataframes(
58-
pruned, node_columns=["title"], edge_columns=["source", "target"]
59-
)
60-
61-
# subset the full nodes and edges to only include the pruned remainders
62-
joined_nodes = pruned_nodes.merge(extracted_nodes, on="title", how="inner")
63-
joined_edges = pruned_edges.merge(
64-
extracted_edges, on=["source", "target"], how="inner"
65-
)
66-
6732
# add in any other columns required by downstream workflows
68-
joined_nodes["type"] = "NOUN PHRASE"
69-
joined_nodes["description"] = ""
33+
extracted_nodes["type"] = "NOUN PHRASE"
34+
extracted_nodes["description"] = ""
35+
extracted_edges["description"] = ""
7036

71-
joined_edges["description"] = ""
72-
73-
final_entities = finalize_entities(
74-
joined_nodes, joined_edges, callbacks, embed_config, layout_enabled
75-
)
76-
final_relationships = finalize_relationships(joined_edges)
77-
return (final_entities, final_relationships)
37+
return (extracted_nodes, extracted_edges)
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Copyright (c) 2024 Microsoft Corporation.
2+
# Licensed under the MIT License
3+
4+
"""All the steps to create the base entity graph."""
5+
6+
import pandas as pd
7+
8+
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
9+
from graphrag.config.models.embed_graph_config import EmbedGraphConfig
10+
from graphrag.index.operations.finalize_entities import finalize_entities
11+
from graphrag.index.operations.finalize_relationships import finalize_relationships
12+
13+
14+
def finalize_graph(
    entities: pd.DataFrame,
    relationships: pd.DataFrame,
    callbacks: WorkflowCallbacks,
    embed_config: EmbedGraphConfig | None = None,
    layout_enabled: bool = False,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """All the steps to finalize the entity and relationship formats.

    Returns the finalized (entities, relationships) pair.
    """
    # NOTE(review): entity finalization receives both frames — presumably it
    # derives per-entity graph columns from the relationships; confirm in
    # finalize_entities.
    return (
        finalize_entities(
            entities, relationships, callbacks, embed_config, layout_enabled
        ),
        finalize_relationships(relationships),
    )
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Copyright (c) 2024 Microsoft Corporation.
2+
# Licensed under the MIT License
3+
4+
"""Prune a full graph based on graph statistics."""
5+
6+
import pandas as pd
7+
8+
from graphrag.config.models.prune_graph_config import PruneGraphConfig
9+
from graphrag.index.operations.create_graph import create_graph
10+
from graphrag.index.operations.graph_to_dataframes import graph_to_dataframes
11+
from graphrag.index.operations.prune_graph import prune_graph as prune_graph_operation
12+
13+
14+
def prune_graph(
    entities: pd.DataFrame,
    relationships: pd.DataFrame,
    pruning_config: PruneGraphConfig,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Prune a full graph based on graph statistics.

    Returns the (entities, relationships) subsets that survive pruning,
    each retaining all of its original columns.
    """
    # Build a transient graph object so the pruning operation can evaluate
    # node/edge statistics, then project the survivors back to dataframes.
    full_graph = create_graph(relationships, edge_attr=["weight"], nodes=entities)
    kept = prune_graph_operation(
        full_graph,
        **{
            option: getattr(pruning_config, option)
            for option in (
                "min_node_freq",
                "max_node_freq_std",
                "min_node_degree",
                "max_node_degree_std",
                "min_edge_weight_pct",
                "remove_ego_nodes",
                "lcc_only",
            )
        },
    )

    kept_nodes, kept_edges = graph_to_dataframes(
        kept, node_columns=["title"], edge_columns=["source", "target"]
    )

    # Inner joins restrict the original frames to the pruned remainders
    # while preserving every other column they carried.
    remaining_entities = kept_nodes.merge(entities, on="title", how="inner")
    remaining_relationships = kept_edges.merge(
        relationships, on=["source", "target"], how="inner"
    )

    return (remaining_entities, remaining_relationships)

graphrag/index/operations/build_noun_graph/build_noun_graph.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ async def _extract_nodes(
4444
Extract initial nodes and edges from text units.
4545
4646
Input: text unit df with schema [id, text, document_id]
47-
Returns a dataframe with schema [id, title, freq, text_unit_ids].
47+
Returns a dataframe with schema [id, title, frequency, text_unit_ids].
4848
"""
4949
cache = cache or NoopPipelineCache()
5050
cache = cache.child("extract_noun_phrases")
@@ -76,9 +76,9 @@ async def extract(row):
7676
noun_node_df.groupby("title").agg({"text_unit_id": list}).reset_index()
7777
)
7878
grouped_node_df = grouped_node_df.rename(columns={"text_unit_id": "text_unit_ids"})
79-
grouped_node_df["freq"] = grouped_node_df["text_unit_ids"].apply(len)
80-
grouped_node_df = grouped_node_df[["title", "freq", "text_unit_ids"]]
81-
return grouped_node_df.loc[:, ["title", "freq", "text_unit_ids"]]
79+
grouped_node_df["frequency"] = grouped_node_df["text_unit_ids"].apply(len)
80+
grouped_node_df = grouped_node_df[["title", "frequency", "text_unit_ids"]]
81+
return grouped_node_df.loc[:, ["title", "frequency", "text_unit_ids"]]
8282

8383

8484
def _extract_edges(
@@ -89,7 +89,7 @@ def _extract_edges(
8989
Extract edges from nodes.
9090
9191
Nodes appear in the same text unit are connected.
92-
Input: nodes_df with schema [id, title, freq, text_unit_ids]
92+
Input: nodes_df with schema [id, title, frequency, text_unit_ids]
9393
Returns: edges_df with schema [source, target, weight, text_unit_ids]
9494
"""
9595
text_units_df = nodes_df.explode("text_unit_ids")
@@ -156,7 +156,7 @@ def _calculate_pmi_edge_weights(
156156
nodes_df: pd.DataFrame,
157157
edges_df: pd.DataFrame,
158158
node_name_col="title",
159-
node_freq_col="freq",
159+
node_freq_col="frequency",
160160
edge_weight_col="weight",
161161
edge_source_col="source",
162162
edge_target_col="target",

graphrag/index/operations/create_graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,6 @@ def create_graph(
1818

1919
if nodes is not None:
2020
nodes.set_index(node_id, inplace=True)
21-
graph.add_nodes_from(nodes.to_dict("index").items())
21+
graph.add_nodes_from((n, dict(d)) for n, d in nodes.iterrows())
2222

2323
return graph

graphrag/index/operations/extract_graph/extract_graph.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,11 @@ def _merge_entities(entity_dfs) -> pd.DataFrame:
154154
all_entities = pd.concat(entity_dfs, ignore_index=True)
155155
return (
156156
all_entities.groupby(["title", "type"], sort=False)
157-
.agg(description=("description", list), text_unit_ids=("source_id", list))
157+
.agg(
158+
description=("description", list),
159+
text_unit_ids=("source_id", list),
160+
frequency=("source_id", "count"),
161+
)
158162
.reset_index()
159163
)
160164

graphrag/index/operations/finalize_entities.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def finalize_entities(
5959
"type",
6060
"description",
6161
"text_unit_ids",
62+
"frequency",
6263
"degree",
6364
"x",
6465
"y",

0 commit comments

Comments (0)