Skip to content

Commit 25b605b

Browse files
authored
Snapshot full graph (#1889)
* Snapshot un-merged entities and relationships * Semver * Fix raw df modification
1 parent e2a4481 commit 25b605b

File tree

4 files changed

+22
-3
lines changed

4 files changed

+22
-3
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "patch",
3+
"description": "Add option to snapshot raw extracted graph tables."
4+
}

graphrag/config/defaults.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,7 @@ class SnapshotsDefaults:
339339

340340
embeddings: bool = False
341341
graphml: bool = False
342+
raw_graph: bool = False
342343

343344

344345
@dataclass

graphrag/config/models/snapshots_config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,7 @@ class SnapshotsConfig(BaseModel):
1919
description="A flag indicating whether to take snapshots of GraphML.",
2020
default=graphrag_config_defaults.snapshots.graphml,
2121
)
22+
raw_graph: bool = Field(
23+
description="A flag indicating whether to take snapshots of the raw extracted graph (entities and relationships) before merging.",
24+
default=graphrag_config_defaults.snapshots.raw_graph,
25+
)

graphrag/index/workflows/extract_graph.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ async def run_workflow(
4343
config.root_dir, summarization_llm_settings
4444
)
4545

46-
entities, relationships = await extract_graph(
46+
entities, relationships, raw_entities, raw_relationships = await extract_graph(
4747
text_units=text_units,
4848
callbacks=context.callbacks,
4949
cache=context.cache,
@@ -58,6 +58,12 @@ async def run_workflow(
5858
await write_table_to_storage(entities, "entities", context.storage)
5959
await write_table_to_storage(relationships, "relationships", context.storage)
6060

61+
if config.snapshots.raw_graph:
62+
await write_table_to_storage(raw_entities, "raw_entities", context.storage)
63+
await write_table_to_storage(
64+
raw_relationships, "raw_relationships", context.storage
65+
)
66+
6167
return WorkflowFunctionOutput(
6268
result={
6369
"entities": entities,
@@ -76,7 +82,7 @@ async def extract_graph(
7682
entity_types: list[str] | None = None,
7783
summarization_strategy: dict[str, Any] | None = None,
7884
summarization_num_threads: int = 4,
79-
) -> tuple[pd.DataFrame, pd.DataFrame]:
85+
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
8086
"""All the steps to create the base entity graph."""
8187
# this returns a graph for each text unit, to be merged later
8288
extracted_entities, extracted_relationships = await extractor(
@@ -103,6 +109,10 @@ async def extract_graph(
103109
callbacks.error(error_msg)
104110
raise ValueError(error_msg)
105111

112+
# copy these as is before any summarization
113+
raw_entities = extracted_entities.copy()
114+
raw_relationships = extracted_relationships.copy()
115+
106116
entities, relationships = await get_summarized_entities_relationships(
107117
extracted_entities=extracted_entities,
108118
extracted_relationships=extracted_relationships,
@@ -112,7 +122,7 @@ async def extract_graph(
112122
summarization_num_threads=summarization_num_threads,
113123
)
114124

115-
return (entities, relationships)
125+
return (entities, relationships, raw_entities, raw_relationships)
116126

117127

118128
async def get_summarized_entities_relationships(

0 commit comments

Comments
 (0)