
Commit 7235c6f

Add Incremental Indexing v1 (#1318)
* Create entrypoint for cli and api (#1067)
* Add cli and api entrypoints for update index
* Semver
* Update docs
* Run tests on feature branch main
* Better /main handling in tests
* Incremental indexing/file delta (#1123)
* Calculate new inputs and deleted inputs on update
* Semver
* Clear ruff checks
* Fix pyright
* Fix PyRight
* Ruff again
* Update relationships after inc index (#1236)
* Collapse create final community reports (#1227)
* Remove extraneous param
* Add community report mocking assertions
* Collapse primary report generation
* Collapse embeddings
* Format
* Semver
* Remove extraneous check
* Move option set
* Collapse create base entity graph (#1233)
* Collapse create_base_entity_graph
* Format/typing
* Semver
* Fix smoke tests
* Simplify assignment
* Collapse create summarized entities (#1237)
* Collapse entity summarize
* Semver
* Collapse create base extracted entities (#1235)
* Set up base assertions
* Replace entity_extract
* Finish collapsing workflow
* Semver
* Update smoke tests
* Incremental indexing/update final text units (#1241)
* Update final text units
* Format
* Address comments
* Add v1 community merge using time period (#1257)
* Add naive community merge using time period
* formatting
* Query fixes
* Add descriptions from merged_entities
* Add summarization and embeddings
* Use iso format
* Ruff
* Pyright and smoke tests
* Pyright
* Pyright
* Update parquet for verb tests
* Fix smoke tests
* Remove sorting
* Update smoke tests
* Smoke tests
* Smoke tests
* Updated verb test to account for latest changes on covariates
* Add config for incremental index + Bug fixes (#1317)
* Add config for incremental index + Bug fixes
* Ruff
* Fix smoke tests
* Semversioner
* Small refactor
* Remove unused file
* Ruff
* Update verb tests inputs
* Update verb tests inputs

---------

Co-authored-by: Nathan Evans <github@talkswithnumbers.com>
1 parent 0cc79b9 commit 7235c6f
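
The heart of the change is an input delta: on an update run, the pipeline compares the previously indexed documents against the current input set, indexes only the new documents, and accounts for the deleted ones when merging into the existing index. The snippet below is a hypothetical Python sketch of that delta step only; the names compute_input_delta and InputDelta do not appear in this commit and are used purely for illustration.

# Hypothetical sketch of "calculate new inputs and deleted inputs on update".
# These names are illustrative only; this is not the commit's actual code.
from dataclasses import dataclass


@dataclass
class InputDelta:
    """Inputs added since the last run and inputs that have disappeared."""

    new_inputs: set[str]
    deleted_inputs: set[str]


def compute_input_delta(previous: set[str], current: set[str]) -> InputDelta:
    """Diff the previously indexed inputs against the current input set."""
    return InputDelta(
        new_inputs=current - previous,
        deleted_inputs=previous - current,
    )


# Example: only "c.txt" needs fresh indexing; artifacts derived from "a.txt"
# have to be merged out of the existing index.
delta = compute_input_delta({"a.txt", "b.txt"}, {"b.txt", "c.txt"})
print(delta.new_inputs, delta.deleted_inputs)  # {'c.txt'} {'a.txt'}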


47 files changed: +940, -247 lines
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{
+    "type": "minor",
+    "description": "Add Incremental Indexing"
+}
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Add relationship merge"
+}
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Add text units update"
+}
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Add naive community merge using time period"
+}
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Add config for incremental updates"
+}

graphrag/api/index.py

Lines changed: 2 additions & 1 deletion
@@ -24,7 +24,6 @@ async def build_index(
     config: GraphRagConfig,
     run_id: str = "",
     is_resume_run: bool = False,
-    is_update_run: bool = False,
     memory_profile: bool = False,
     progress_reporter: ProgressReporter | None = None,
     emit: list[TableEmitterType] = [TableEmitterType.Parquet], # noqa: B006
@@ -54,6 +53,8 @@ async def build_index(
     list[PipelineRunResult]
         The list of pipeline run results
     """
+    is_update_run = bool(config.update_index_storage)
+
     if is_resume_run and is_update_run:
         msg = "Cannot resume and update a run at the same time."
         raise ValueError(msg)
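
Callers no longer pass is_update_run; build_index now infers update mode from whether update_index_storage is configured. Below is a minimal sketch of driving an update run through the API under that assumption. The import paths follow the files touched in this commit, but the settings values (including the placeholder API key) are illustrative, not an official recipe.

import asyncio

from graphrag.api.index import build_index
from graphrag.config.create_graphrag_config import create_graphrag_config

# Illustrative settings: the presence of "update_index_storage" is what flips
# build_index into update mode (is_update_run = bool(config.update_index_storage)).
settings = {
    "llm": {"api_key": "<your-api-key>"},  # placeholder
    "storage": {"type": "file", "base_dir": "output"},
    "update_index_storage": {"type": "file", "base_dir": "update_output"},
}
config = create_graphrag_config(values=settings, root_dir=".")

# No is_update_run argument anymore; the configured update storage is enough.
results = asyncio.run(build_index(config=config))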

graphrag/cli/index.py

Lines changed: 1 addition & 3 deletions
@@ -69,7 +69,6 @@ def index_cli(
     root_dir: Path,
     verbose: bool,
     resume: str | None,
-    update_index_id: str | None,
     memprofile: bool,
     cache: bool,
     reporter: ReporterType,
@@ -82,7 +81,7 @@ def index_cli(
     """Run the pipeline with the given config."""
     progress_reporter = create_progress_reporter(reporter)
     info, error, success = _logger(progress_reporter)
-    run_id = resume or update_index_id or time.strftime("%Y%m%d-%H%M%S")
+    run_id = resume or time.strftime("%Y%m%d-%H%M%S")

     config = load_config(root_dir, config_filepath)
     config.storage.base_dir = str(output_dir) if output_dir else config.storage.base_dir
@@ -123,7 +122,6 @@ def index_cli(
            config=config,
            run_id=run_id,
            is_resume_run=bool(resume),
-           is_update_run=bool(update_index_id),
            memory_profile=memprofile,
            progress_reporter=progress_reporter,
            emit=emit,

graphrag/cli/main.py

Lines changed: 0 additions & 11 deletions
@@ -102,12 +102,6 @@ def _index_cli(
             help="Skip any preflight validation. Useful when running no LLM steps."
         ),
     ] = False,
-    update_index: Annotated[
-        str | None,
-        typer.Option(
-            help="Update an index run id, leveraging previous outputs and applying new indexes."
-        ),
-    ] = None,
     output: Annotated[
         Path | None,
         typer.Option(
@@ -119,15 +113,10 @@ def _index_cli(
     ] = None,
 ):
     """Build a knowledge graph index."""
-    if resume and update_index:
-        msg = "Cannot resume and update a run at the same time"
-        raise ValueError(msg)
-
     index_cli(
         root_dir=root,
         verbose=verbose,
         resume=resume,
-        update_index_id=update_index,
         memprofile=memprofile,
         cache=cache,
         reporter=ReporterType(reporter),

graphrag/config/create_graphrag_config.py

Lines changed: 21 additions & 0 deletions
@@ -375,6 +375,25 @@ def hydrate_parallelization_params(
             container_name=reader.str(Fragment.container_name),
             base_dir=reader.str(Fragment.base_dir) or defs.STORAGE_BASE_DIR,
         )
+
+    with (
+        reader.envvar_prefix(Section.update_index_storage),
+        reader.use(values.get("update_index_storage")),
+    ):
+        s_type = reader.str(Fragment.type)
+        if s_type:
+            update_index_storage_model = StorageConfig(
+                type=StorageType(s_type) if s_type else defs.STORAGE_TYPE,
+                connection_string=reader.str(Fragment.conn_string),
+                storage_account_blob_url=reader.str(
+                    Fragment.storage_account_blob_url
+                ),
+                container_name=reader.str(Fragment.container_name),
+                base_dir=reader.str(Fragment.base_dir)
+                or defs.UPDATE_STORAGE_BASE_DIR,
+            )
+        else:
+            update_index_storage_model = None
     with reader.envvar_prefix(Section.chunk), reader.use(values.get("chunks")):
         group_by_columns = reader.list("group_by_columns", "BY_COLUMNS")
         if group_by_columns is None:
@@ -547,6 +566,7 @@ def hydrate_parallelization_params(
         embed_graph=embed_graph_model,
         reporting=reporting_model,
         storage=storage_model,
+        update_index_storage=update_index_storage_model,
         cache=cache_model,
         input=input_model,
         chunks=chunks_model,
@@ -624,6 +644,7 @@ class Section(str, Enum):
     storage = "STORAGE"
     summarize_descriptions = "SUMMARIZE_DESCRIPTIONS"
     umap = "UMAP"
+    update_index_storage = "UPDATE_INDEX_STORAGE"
     local_search = "LOCAL_SEARCH"
     global_search = "GLOBAL_SEARCH"
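
The new block mirrors the existing storage hydration: the section is only materialized when a type is provided, and base_dir falls back to defs.UPDATE_STORAGE_BASE_DIR ("update_output", added in defaults.py below). A sketch of the environment-variable route follows; the GRAPHRAG_UPDATE_INDEX_STORAGE_* names are inferred from Section.update_index_storage by analogy with the existing GRAPHRAG_STORAGE_* variables, so treat them as an assumption.

import os

from graphrag.config.create_graphrag_config import create_graphrag_config

os.environ["GRAPHRAG_API_KEY"] = "<your-api-key>"  # placeholder
# Inferred variable name; providing a type is what enables update storage.
os.environ["GRAPHRAG_UPDATE_INDEX_STORAGE_TYPE"] = "file"
# BASE_DIR is intentionally left unset to exercise the default fallback.

config = create_graphrag_config(root_dir=".")

# With a type present the model is hydrated; with no base dir it falls back
# to defs.UPDATE_STORAGE_BASE_DIR ("update_output").
assert config.update_index_storage is not None
print(config.update_index_storage.base_dir)  # "update_output"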

graphrag/config/defaults.py

Lines changed: 1 addition & 0 deletions
@@ -86,6 +86,7 @@
 STORAGE_TYPE = StorageType.file
 SUMMARIZE_DESCRIPTIONS_MAX_LENGTH = 500
 UMAP_ENABLED = False
+UPDATE_STORAGE_BASE_DIR = "update_output"

 VECTOR_STORE = f"""
 type: {VectorStoreType.LanceDB.value}
