66import pandas as pd
77
88from graphrag .cache .pipeline_cache import PipelineCache
9- from graphrag .callbacks .workflow_callbacks import WorkflowCallbacks
10- from graphrag .config .models .embed_graph_config import EmbedGraphConfig
119from graphrag .config .models .extract_graph_nlp_config import ExtractGraphNLPConfig
12- from graphrag .config .models .prune_graph_config import PruneGraphConfig
1310from graphrag .index .operations .build_noun_graph .build_noun_graph import build_noun_graph
1411from graphrag .index .operations .build_noun_graph .np_extractors .factory import (
1512 create_noun_phrase_extractor ,
1613)
17- from graphrag .index .operations .create_graph import create_graph
18- from graphrag .index .operations .finalize_entities import finalize_entities
19- from graphrag .index .operations .finalize_relationships import finalize_relationships
20- from graphrag .index .operations .graph_to_dataframes import graph_to_dataframes
21- from graphrag .index .operations .prune_graph import prune_graph
2214
2315
2416async def extract_graph_nlp (
2517 text_units : pd .DataFrame ,
26- callbacks : WorkflowCallbacks ,
2718 cache : PipelineCache ,
2819 extraction_config : ExtractGraphNLPConfig ,
29- pruning_config : PruneGraphConfig ,
30- embed_config : EmbedGraphConfig | None = None ,
31- layout_enabled : bool = False ,
3220) -> tuple [pd .DataFrame , pd .DataFrame ]:
3321 """All the steps to create the base entity graph."""
3422 text_analyzer_config = extraction_config .text_analyzer
@@ -41,37 +29,9 @@ async def extract_graph_nlp(
4129 cache = cache ,
4230 )
4331
44- # create a temporary graph to prune, then turn it back into dataframes
45- graph = create_graph (extracted_edges , edge_attr = ["weight" ], nodes = extracted_nodes )
46- pruned = prune_graph (
47- graph ,
48- min_node_freq = pruning_config .min_node_freq ,
49- max_node_freq_std = pruning_config .max_node_freq_std ,
50- min_node_degree = pruning_config .min_node_degree ,
51- max_node_degree_std = pruning_config .max_node_degree_std ,
52- min_edge_weight_pct = pruning_config .min_edge_weight_pct ,
53- remove_ego_nodes = pruning_config .remove_ego_nodes ,
54- lcc_only = pruning_config .lcc_only ,
55- )
56-
57- pruned_nodes , pruned_edges = graph_to_dataframes (
58- pruned , node_columns = ["title" ], edge_columns = ["source" , "target" ]
59- )
60-
61- # subset the full nodes and edges to only include the pruned remainders
62- joined_nodes = pruned_nodes .merge (extracted_nodes , on = "title" , how = "inner" )
63- joined_edges = pruned_edges .merge (
64- extracted_edges , on = ["source" , "target" ], how = "inner"
65- )
66-
6732 # add in any other columns required by downstream workflows
68- joined_nodes ["type" ] = "NOUN PHRASE"
69- joined_nodes ["description" ] = ""
33+ extracted_nodes ["type" ] = "NOUN PHRASE"
34+ extracted_nodes ["description" ] = ""
35+ extracted_edges ["description" ] = ""
7036
71- joined_edges ["description" ] = ""
72-
73- final_entities = finalize_entities (
74- joined_nodes , joined_edges , callbacks , embed_config , layout_enabled
75- )
76- final_relationships = finalize_relationships (joined_edges )
77- return (final_entities , final_relationships )
37+ return (extracted_nodes , extracted_edges )
0 commit comments