Skip to content

Commit 0f54cb1

Browse files
committed
Add perf fixes back in
1 parent 41eda41 commit 0f54cb1

File tree

1 file changed

+17
-24
lines changed

1 file changed

+17
-24
lines changed

graphrag/index/operations/build_noun_graph/build_noun_graph.py

Lines changed: 17 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
 """Graph extraction using NLP."""
 
 from itertools import combinations
+from typing import Any
 
 import numpy as np
 import pandas as pd
@@ -31,7 +32,6 @@ async def build_noun_graph(
         text_units, text_analyzer, num_threads=num_threads, cache=cache
     )
     edges_df = _extract_edges(nodes_df, normalize_edge_weights=normalize_edge_weights)
-
     return (nodes_df, edges_df)
 
 
@@ -95,35 +95,28 @@ def _extract_edges(
     """
     text_units_df = nodes_df.explode("text_unit_ids")
     text_units_df = text_units_df.rename(columns={"text_unit_ids": "text_unit_id"})
+
     text_units_df = (
-        text_units_df.groupby("text_unit_id").agg({"title": list}).reset_index()
+        text_units_df.groupby("text_unit_id")
+        .agg({"title": lambda x: list(x) if len(x) > 1 else np.nan})
+        .reset_index()
     )
-
-    text_units_df["edges"] = text_units_df["title"].apply(
-        lambda x: list(combinations(x, 2))
+    text_units_df = text_units_df.dropna()
+    titles = text_units_df["title"].tolist()
+    all_edges: Any = [list(combinations(t, 2)) for t in titles]
+
+    text_units_df = text_units_df.assign(edges=all_edges)
+    edge_df = text_units_df.explode("edges")[["edges", "text_unit_id"]]
+
+    edge_df[["source", "target"]] = edge_df["edges"].to_list()
+    edge_df["min_source"] = edge_df[["source", "target"]].min(axis=1)
+    edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1)
+    edge_df = edge_df.drop(columns=["source", "target"]).rename(
+        columns={"min_source": "source", "max_target": "target"}
     )
 
-    edge_df = text_units_df.explode("edges").loc[:, ["edges", "text_unit_id"]]
-
-    edge_df["source"] = edge_df["edges"].apply(
-        lambda x: x[0] if isinstance(x, tuple) else None
-    )
-    edge_df["target"] = edge_df["edges"].apply(
-        lambda x: x[1] if isinstance(x, tuple) else None
-    )
     edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())]
     edge_df = edge_df.drop(columns=["edges"])
-    # make sure source is always smaller than target
-    edge_df["source"], edge_df["target"] = zip(
-        *edge_df.apply(
-            lambda x: (x["source"], x["target"])
-            if x["source"] < x["target"]
-            else (x["target"], x["source"]),
-            axis=1,
-        ),
-        strict=False,
-    )
-
     # group by source and target, count the number of text units
     grouped_edge_df = (
         edge_df.groupby(["source", "target"]).agg({"text_unit_id": list}).reset_index()

0 commit comments

Comments
 (0)