microsoft
diff --git a/‎tests/verbs/data/communities.parquet‎
1009 Bytes b/‎tests/verbs/data/communities.parquet‎
1009 Bytes
diff --git a/‎tests/verbs/data/community_reports.parquet‎
1.9 KB b/‎tests/verbs/data/community_reports.parquet‎
1.9 KB
diff --git a/‎tests/verbs/data/covariates.parquet‎
-13 Bytes b/‎tests/verbs/data/covariates.parquet‎
-13 Bytes
diff --git a/‎tests/verbs/data/documents.parquet‎
61 Bytes b/‎tests/verbs/data/documents.parquet‎
61 Bytes
diff --git a/‎tests/verbs/data/entities.parquet‎
15 Bytes b/‎tests/verbs/data/entities.parquet‎
15 Bytes
diff --git a/‎tests/verbs/data/relationships.parquet‎
-29 Bytes b/‎tests/verbs/data/relationships.parquet‎
-29 Bytes
diff --git a/‎tests/verbs/data/text_units.parquet‎
-10 Bytes b/‎tests/verbs/data/text_units.parquet‎
-10 Bytes
diff --git a/‎tests/verbs/test_create_base_text_units.py‎
Lines changed: 5 additions & 0 deletions b/‎tests/verbs/test_create_base_text_units.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎tests/verbs/test_create_final_documents.py‎
Lines changed: 10 additions & 6 deletions b/‎tests/verbs/test_create_final_documents.py‎
Lines changed: 10 additions & 6 deletions
diff --git a/‎tests/verbs/util.py‎
Lines changed: 10 additions & 2 deletions b/‎tests/verbs/util.py‎
Lines changed: 10 additions & 2 deletions
@@ -11,6 +11,7 @@
     compare_outputs,
     create_test_context,
     load_test_table,
+    update_document_metadata,
 )
 
 
@@ -43,6 +44,8 @@ async def test_create_base_text_units_metadata():
     config.input.metadata = ["title"]
     config.chunks.prepend_metadata = True
 
+    await update_document_metadata(config.input.metadata, context)
+
     await run_workflow(
         config,
         context,
@@ -65,6 +68,8 @@ async def test_create_base_text_units_metadata_included_in_chunk():
     config.chunks.prepend_metadata = True
     config.chunks.chunk_size_includes_metadata = True
 
+    await update_document_metadata(config.input.metadata, context)
+
     await run_workflow(
         config,
         context,
 
@@ -13,6 +13,7 @@
     compare_outputs,
     create_test_context,
     load_test_table,
+    update_document_metadata,
 )
 
 
@@ -37,15 +38,18 @@ async def test_create_final_documents():
 
 
 async def test_create_final_documents_with_metadata_column():
-    expected = load_test_table("documents")
-
     context = await create_test_context(
         storage=["text_units"],
     )
 
     config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
     config.input.metadata = ["title"]
 
+    # simulate the metadata construction during initial input loading
+    await update_document_metadata(config.input.metadata, context)
+
+    expected = await load_table_from_storage("documents", context.storage)
+
     await run_workflow(
         config,
         context,
@@ -54,12 +58,12 @@ async def test_create_final_documents_with_metadata_column():
 
     actual = await load_table_from_storage("documents", context.storage)
 
-    # we should have dropped "title" and added "attributes"
-    # our test dataframe does not have attributes, so we'll assert without it
+    # metadata was added to the expected table above, so we compare it directly
     # and separately confirm it is in the output
     compare_outputs(
-        actual, expected, columns=["id", "human_readable_id", "text", "text_unit_ids"]
+        actual, expected, columns=["id", "human_readable_id", "text", "metadata"]
     )
-    assert len(actual.columns) == 6
+    assert len(actual.columns) == 7
     assert "title" in actual.columns
+    assert "text_unit_ids" in actual.columns
     assert "metadata" in actual.columns
@@ -7,7 +7,7 @@
 import graphrag.config.defaults as defs
 from graphrag.index.context import PipelineRunContext
 from graphrag.index.run.utils import create_run_context
-from graphrag.utils.storage import write_table_to_storage
+from graphrag.utils.storage import load_table_from_storage, write_table_to_storage
 
 pd.set_option("display.max_columns", None)
 
@@ -43,7 +43,6 @@ async def create_test_context(storage: list[str] | None = None) -> PipelineRunCo
     if storage:
         for name in storage:
             table = load_test_table(name)
-            # normal storage interface insists on bytes
             await write_table_to_storage(table, name, context.storage)
 
     return context
@@ -83,3 +82,12 @@ def compare_outputs(
             print("Actual:")
             print(actual[column])
             raise
+
+
+async def update_document_metadata(metadata: list[str], context: PipelineRunContext):
+    """Add a `metadata` column (one dict per row, built from the configured `metadata` columns) to the stored `documents` table, for later parsing by the text units and final documents workflows."""
+    documents = await load_table_from_storage("documents", context.storage)
+    documents["metadata"] = documents[metadata].apply(lambda row: row.to_dict(), axis=1)  # each row becomes {column name: value}
+    await write_table_to_storage(
+        documents, "documents", context.storage
+    )  # write back to the runtime context storage only