Add perf fixes back in

natoverse · natoverse · commit 0f54cb109bc8 · 2025-04-24T15:39:57.000-07:00
diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py
@@ -4,6 +4,7 @@
 """Graph extraction using NLP."""
 
 from itertools import combinations
+from typing import Any
 
 import numpy as np
 import pandas as pd
@@ -31,7 +32,6 @@ async def build_noun_graph(
         text_units, text_analyzer, num_threads=num_threads, cache=cache
     )
     edges_df = _extract_edges(nodes_df, normalize_edge_weights=normalize_edge_weights)
-
     return (nodes_df, edges_df)
 
 
@@ -95,35 +95,28 @@ def _extract_edges(
     """
     text_units_df = nodes_df.explode("text_unit_ids")
     text_units_df = text_units_df.rename(columns={"text_unit_ids": "text_unit_id"})
+
     text_units_df = (
-        text_units_df.groupby("text_unit_id").agg({"title": list}).reset_index()
+        text_units_df.groupby("text_unit_id")
+        .agg({"title": lambda x: list(x) if len(x) > 1 else np.nan})
+        .reset_index()
     )
-
-    text_units_df["edges"] = text_units_df["title"].apply(
-        lambda x: list(combinations(x, 2))
+    text_units_df = text_units_df.dropna()
+    titles = text_units_df["title"].tolist()
+    all_edges: Any = [list(combinations(t, 2)) for t in titles]
+
+    text_units_df = text_units_df.assign(edges=all_edges)
+    edge_df = text_units_df.explode("edges")[["edges", "text_unit_id"]]
+
+    edge_df[["source", "target"]] = edge_df["edges"].to_list()
+    edge_df["min_source"] = edge_df[["source", "target"]].min(axis=1)
+    edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1)
+    edge_df = edge_df.drop(columns=["source", "target"]).rename(
+        columns={"min_source": "source", "max_target": "target"}
     )
 
-    edge_df = text_units_df.explode("edges").loc[:, ["edges", "text_unit_id"]]
-
-    edge_df["source"] = edge_df["edges"].apply(
-        lambda x: x[0] if isinstance(x, tuple) else None
-    )
-    edge_df["target"] = edge_df["edges"].apply(
-        lambda x: x[1] if isinstance(x, tuple) else None
-    )
     edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())]
     edge_df = edge_df.drop(columns=["edges"])
-    # make sure source is always smaller than target
-    edge_df["source"], edge_df["target"] = zip(
-        *edge_df.apply(
-            lambda x: (x["source"], x["target"])
-            if x["source"] < x["target"]
-            else (x["target"], x["source"]),
-            axis=1,
-        ),
-        strict=False,
-    )
-
     # group by source and target, count the number of text units
     grouped_edge_df = (
         edge_df.groupby(["source", "target"]).agg({"text_unit_id": list}).reset_index()