5 | 5 |
6 | 6 | import json |
7 | 7 | import logging |
| 8 | +import re |
8 | 9 | import time |
9 | 10 | import traceback |
10 | 11 | from collections.abc import AsyncIterable |
31 | 32 | from graphrag.logger.progress import Progress |
32 | 33 | from graphrag.storage.factory import StorageFactory |
33 | 34 | from graphrag.storage.pipeline_storage import PipelineStorage |
34 | | -from graphrag.utils.storage import write_table_to_storage |
| 35 | +from graphrag.utils.storage import load_table_from_storage, write_table_to_storage |
35 | 36 |
36 | 37 | log = logging.getLogger(__name__) |
37 | 38 |
@@ -66,45 +67,49 @@ async def run_pipeline( |
66 | 67 | if is_update_run: |
67 | 68 | progress_logger.info("Running incremental indexing.") |
68 | 69 |
69 | | - update_storage_config = config.update_index_output.model_dump() # type: ignore |
70 | | - update_index_storage = StorageFactory().create_storage( |
71 | | - storage_type=update_storage_config["type"], # type: ignore |
72 | | - kwargs=update_storage_config, |
73 | | - ) |
74 | | - |
75 | 70 | delta_dataset = await get_delta_docs(dataset, storage) |
76 | 71 |
77 | | - # Fail on empty delta dataset |
| 72 | + # warn on empty delta dataset |
78 | 73 | if delta_dataset.new_inputs.empty: |
79 | | - error_msg = "Incremental Indexing Error: No new documents to process." |
80 | | - raise ValueError(error_msg) |
81 | | - |
82 | | - delta_storage = update_index_storage.child("delta") |
83 | | - |
84 | | - # Run the pipeline on the new documents |
85 | | - tables_dict = {} |
86 | | - async for table in _run_pipeline( |
87 | | - pipeline=pipeline, |
88 | | - config=config, |
89 | | - dataset=delta_dataset.new_inputs, |
90 | | - cache=cache, |
91 | | - storage=delta_storage, |
92 | | - callbacks=callback_chain, |
93 | | - logger=progress_logger, |
94 | | - ): |
95 | | - tables_dict[table.workflow] = table.result |
96 | | - |
97 | | - progress_logger.success("Finished running workflows on new documents.") |
98 | | - |
99 | | - await update_dataframe_outputs( |
100 | | - dataframe_dict=tables_dict, |
101 | | - storage=storage, |
102 | | - update_storage=update_index_storage, |
103 | | - config=config, |
104 | | - cache=cache, |
105 | | - callbacks=NoopWorkflowCallbacks(), |
106 | | - progress_logger=progress_logger, |
107 | | - ) |
| 74 | + warning_msg = "Incremental indexing found no new documents, exiting." |
| 75 | + progress_logger.warning(warning_msg) |
| 76 | + else: |
| 77 | + update_storage_config = config.update_index_output.model_dump() # type: ignore |
| 78 | + update_storage = StorageFactory().create_storage( |
| 79 | + storage_type=update_storage_config["type"], # type: ignore |
| 80 | + kwargs=update_storage_config, |
| 81 | + ) |
| 82 | + # we use this to store the new subset index, and will merge its content with the previous index |
| 83 | + timestamped_storage = update_storage.child(time.strftime("%Y%m%d-%H%M%S")) |
| 84 | + delta_storage = timestamped_storage.child("delta") |
| 85 | + # copy the previous output to a backup folder, so we can replace it with the update |
| 86 | + # we'll read from this later when we merge the old and new indexes |
| 87 | + previous_storage = timestamped_storage.child("previous") |
| 88 | + await _copy_previous_output(storage, previous_storage) |
| 89 | + |
| 90 | + # Run the pipeline on the new documents |
| 91 | + async for table in _run_pipeline( |
| 92 | + pipeline=pipeline, |
| 93 | + config=config, |
| 94 | + dataset=delta_dataset.new_inputs, |
| 95 | + cache=cache, |
| 96 | + storage=delta_storage, |
| 97 | + callbacks=callback_chain, |
| 98 | + logger=progress_logger, |
| 99 | + ): |
| 100 | + yield table |
| 101 | + |
| 102 | + progress_logger.success("Finished running workflows on new documents.") |
| 103 | + |
| 104 | + await update_dataframe_outputs( |
| 105 | + previous_storage=previous_storage, |
| 106 | + delta_storage=delta_storage, |
| 107 | + output_storage=storage, |
| 108 | + config=config, |
| 109 | + cache=cache, |
| 110 | + callbacks=NoopWorkflowCallbacks(), |
| 111 | + progress_logger=progress_logger, |
| 112 | + ) |
108 | 113 |
109 | 114 | else: |
110 | 115 | progress_logger.info("Running standard indexing.") |
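
Reviewer note: with this change the incremental branch yields each workflow table as it completes (rather than collecting results into the old `tables_dict`), so an incremental run is consumed the same way as a standard one. A minimal sketch under stated assumptions — `run_pipeline`'s full parameter list isn't visible in this diff, and the yielded objects are only assumed to expose `workflow` and `result` because the removed `tables_dict` code used those attributes:

```python
# Hedged sketch, not part of this PR. Assumes it lives alongside run_pipeline in
# this module, so `run_pipeline` and `log` are already in scope.
async def collect_pipeline_tables(**run_pipeline_kwargs):
    """Drain run_pipeline and map workflow name -> result table."""
    tables = {}
    async for table in run_pipeline(**run_pipeline_kwargs):
        # Works for both the standard branch and the (now yielding) incremental branch.
        log.info("completed workflow: %s", table.workflow)
        tables[table.workflow] = table.result
    return tables
```
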
@@ -172,3 +177,13 @@ async def _dump_stats(stats: PipelineRunStats, storage: PipelineStorage) -> None |
172 | 177 | await storage.set( |
173 | 178 | "stats.json", json.dumps(asdict(stats), indent=4, ensure_ascii=False) |
174 | 179 | ) |
| 180 | + |
| 181 | + |
| 182 | +async def _copy_previous_output( |
| 183 | + storage: PipelineStorage, |
| 184 | + copy_storage: PipelineStorage, |
| 185 | +): |
| 186 | + for file in storage.find(re.compile(r"\.parquet$")): |
| 187 | + base_name = file[0].replace(".parquet", "") |
| 188 | + table = await load_table_from_storage(base_name, storage) |
| 189 | + await write_table_to_storage(table, base_name, copy_storage) |
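
For clarity on the new backup step, here is a hedged sketch of how it is wired together and the folder layout it implies. It uses only calls visible in this diff (`child`, `time.strftime`, `_copy_previous_output`); the helper name and the idea of returning the two child storages are illustrative, not part of the PR:

```python
# Hedged sketch, not part of this PR. Assumes `storage` (the live index output)
# and `update_storage` (built from config.update_index_output) are PipelineStorage
# instances, as in run_pipeline above.
async def snapshot_previous_index(storage, update_storage):
    run_id = time.strftime("%Y%m%d-%H%M%S")
    timestamped_storage = update_storage.child(run_id)
    previous_storage = timestamped_storage.child("previous")  # backup of the old index
    delta_storage = timestamped_storage.child("delta")        # index built from new docs only
    # Copy every *.parquet table out of the live output so the merge step can still
    # read the pre-update index from `previous` after `storage` is overwritten.
    await _copy_previous_output(storage, previous_storage)
    return delta_storage, previous_storage
```

Because each run gets its own timestamped child folder, repeated incremental runs keep their `previous`/`delta` snapshots side by side instead of overwriting one another.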