
Commit 247547f

Move metadata prepending to a util
1 parent c8dbb02 commit 247547f

File tree

8 files changed: +65 / -51 lines changed

packages/graphrag/graphrag/chunking/chunker.py

Lines changed: 1 addition & 1 deletion

@@ -15,5 +15,5 @@ def __init__(self, **kwargs: Any) -> None:
         """Create a chunker instance."""
 
     @abstractmethod
-    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
+    def chunk(self, text: str) -> list[str]:
         """Chunk method definition."""

packages/graphrag/graphrag/chunking/chunking_config.py

Lines changed: 1 addition & 1 deletion

@@ -32,5 +32,5 @@ class ChunkingConfig(BaseModel):
     )
     prepend_metadata: bool | None = Field(
         description="Prepend metadata into each chunk.",
-        default=None,
+        default=False,
     )
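
With this change, a ChunkingConfig built without an explicit value reports False instead of None for prepend_metadata. A minimal sketch of the new default (assuming the remaining config fields have defaults and do not need to be supplied):

from graphrag.chunking.chunking_config import ChunkingConfig

# Sketch only: assumes ChunkingConfig's other fields all have defaults.
config = ChunkingConfig()
print(config.prepend_metadata)  # False (was None before this commit)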

packages/graphrag/graphrag/chunking/prepend_metadata.py

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""A module containing 'prepend_metadata' function."""
+
+
+def prepend_metadata(
+    text: str, metadata: dict, delimiter: str = ": ", line_delimiter: str = "\n"
+) -> str:
+    """Prepend metadata to the given text. This utility writes the dict as rows of key/value pairs."""
+    metadata_str = (
+        line_delimiter.join(f"{k}{delimiter}{v}" for k, v in metadata.items())
+        + line_delimiter
+    )
+    return metadata_str + text
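
For reference, a minimal usage sketch of the new utility with its default delimiters; the chunk strings and metadata here are made-up examples:

from graphrag.chunking.prepend_metadata import prepend_metadata

# Hypothetical chunks as a chunker might produce them, plus made-up metadata.
chunks = ["This is a test.", "Another sentence."]
metadata = {"title": "Example Doc", "author": "Jane"}

labeled = [prepend_metadata(chunk, metadata) for chunk in chunks]
# labeled[0] == "title: Example Doc\nauthor: Jane\nThis is a test."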

packages/graphrag/graphrag/chunking/sentence_chunker.py

Lines changed: 3 additions & 9 deletions

@@ -14,16 +14,10 @@
 class SentenceChunker(Chunker):
     """A chunker that splits text into sentence-based chunks."""
 
-    def __init__(self, prepend_metadata: bool = False, **kwargs: Any) -> None:
+    def __init__(self, **kwargs: Any) -> None:
         """Create a sentence chunker instance."""
-        self._prepend_metadata = prepend_metadata
         bootstrap()
 
-    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
+    def chunk(self, text) -> list[str]:
         """Chunk the text into sentence-based chunks."""
-        chunks = nltk.sent_tokenize(text)
-
-        if self._prepend_metadata and metadata is not None:
-            metadata_str = ".\n".join(f"{k}: {v}" for k, v in metadata.items()) + ".\n"
-            chunks = [metadata_str + chunk for chunk in chunks]
-        return chunks
+        return nltk.sent_tokenize(text)

packages/graphrag/graphrag/chunking/token_chunker.py

Lines changed: 2 additions & 10 deletions

@@ -23,31 +23,23 @@ def __init__(
         size: int,
         overlap: int,
         tokenizer: Tokenizer,
-        prepend_metadata: bool = False,
         **kwargs: Any,
     ) -> None:
         """Create a token chunker instance."""
         self._size = size
         self._overlap = overlap
-        self._prepend_metadata = prepend_metadata
         self._tokenizer = tokenizer
 
-    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
+    def chunk(self, text: str) -> list[str]:
         """Chunk the text into token-based chunks."""
-        chunks = split_text_on_tokens(
+        return split_text_on_tokens(
             text,
             chunk_size=self._size,
             chunk_overlap=self._overlap,
             encode=self._tokenizer.encode,
             decode=self._tokenizer.decode,
         )
 
-        if self._prepend_metadata and metadata is not None:
-            metadata_str = ".\n".join(f"{k}: {v}" for k, v in metadata.items()) + ".\n"
-            chunks = [metadata_str + chunk for chunk in chunks]
-
-        return chunks
-
 
 def split_text_on_tokens(
     text: str,
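
Note one small behavioral difference: the inlined logic removed from both chunkers joined metadata rows with ".\n", while the new utility defaults to ": " between key and value and a plain "\n" line delimiter. A quick comparison sketch:

from graphrag.chunking.prepend_metadata import prepend_metadata

metadata = {"message": "hello"}
chunk = "This is a test."

# Old inlined behavior (removed from the chunkers above):
old = ".\n".join(f"{k}: {v}" for k, v in metadata.items()) + ".\n" + chunk
# old == "message: hello.\nThis is a test."

# New utility with its defaults:
new = prepend_metadata(chunk, metadata)
# new == "message: hello\nThis is a test."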

packages/graphrag/graphrag/index/workflows/create_base_text_units.py

Lines changed: 9 additions & 4 deletions

@@ -13,6 +13,7 @@
 from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
 from graphrag.chunking.chunker import Chunker
 from graphrag.chunking.chunker_factory import create_chunker
+from graphrag.chunking.prepend_metadata import prepend_metadata as prepend_metadata_fn
 from graphrag.config.models.graph_rag_config import GraphRagConfig
 from graphrag.index.typing.context import PipelineRunContext
 from graphrag.index.typing.workflow import WorkflowFunctionOutput

@@ -39,6 +40,7 @@ async def run_workflow(
         context.callbacks,
         tokenizer=tokenizer,
         chunker=chunker,
+        prepend_metadata=config.chunks.prepend_metadata,
     )
 
     await write_table_to_storage(output, "text_units", context.output_storage)

@@ -52,6 +54,7 @@ def create_base_text_units(
     callbacks: WorkflowCallbacks,
     tokenizer: Tokenizer,
     chunker: Chunker,
+    prepend_metadata: bool | None = False,
 ) -> pd.DataFrame:
     """All the steps to transform base text_units."""
     documents.sort_values(by=["id"], ascending=[True], inplace=True)

@@ -63,10 +66,12 @@ def create_base_text_units(
     logger.info("Starting chunking process for %d documents", total_rows)
 
     def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
-        metadata = row.get("metadata")
-        if (metadata is not None) and isinstance(metadata, str):
-            metadata = json.loads(metadata)
-        row["chunks"] = chunker.chunk(row["text"], metadata=metadata)
+        row["chunks"] = chunker.chunk(row["text"])
+
+        metadata = row.get("metadata", None)
+        if prepend_metadata and metadata is not None:
+            metadata = json.loads(metadata) if isinstance(metadata, str) else metadata
+            row["chunks"] = [prepend_metadata_fn(chunk, metadata) for chunk in row["chunks"]]
         tick()
         logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
         return row
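
The same composition can be reproduced outside the pipeline; a small sketch (the text and metadata values are illustrative, and metadata is parsed from a JSON string as in the workflow above):

import json

from graphrag.chunking.prepend_metadata import prepend_metadata
from graphrag.chunking.sentence_chunker import SentenceChunker

chunker = SentenceChunker()
text = "First sentence. Second sentence."
raw_metadata = '{"title": "report"}'  # illustrative; documents may carry metadata as a JSON string

chunks = chunker.chunk(text)
metadata = json.loads(raw_metadata) if isinstance(raw_metadata, str) else raw_metadata
chunks = [prepend_metadata(chunk, metadata) for chunk in chunks]
# chunks[0] == "title: report\nFirst sentence."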

tests/unit/chunking/test_chunker.py

Lines changed: 0 additions & 26 deletions

@@ -48,18 +48,6 @@ def test_mixed_whitespace_handling(self):
         assert chunks[0] == " Sentence with spaces."
         assert chunks[1] == "Another one!"
 
-    def test_prepend_metadata(self):
-        """Test prepending metadata to chunks"""
-        input = "This is a test. Another sentence."
-        config = ChunkingConfig(
-            strategy=ChunkStrategyType.Sentence, prepend_metadata=True
-        )
-        chunker = create_chunker(config)
-        chunks = chunker.chunk(input, metadata={"message": "hello"})
-
-        assert chunks[0] == "message: hello.\nThis is a test."
-        assert chunks[1] == "message: hello.\nAnother sentence."
-
 
 class TestRunTokens:
     @patch("tiktoken.get_encoding")

@@ -83,20 +71,6 @@ def test_basic_functionality(self, mock_get_encoding):
 
         assert len(chunks) > 0
 
-    def test_prepend_metadata(self):
-        """Test prepending metadata to chunks"""
-        mocked_tokenizer = MockTokenizer()
-        input = "This is a test."
-        config = ChunkingConfig(
-            strategy=ChunkStrategyType.Tokens, size=5, overlap=0, prepend_metadata=True
-        )
-        chunker = create_chunker(config, tokenizer=mocked_tokenizer)
-        chunks = chunker.chunk(input, metadata={"message": "hello"})
-
-        assert chunks[0] == "message: hello.\nThis "
-        assert chunks[1] == "message: hello.\nis a "
-        assert chunks[2] == "message: hello.\ntest."
-
 
 def test_split_text_str_empty():
     result = split_text_on_tokens(

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+from graphrag.chunking.prepend_metadata import prepend_metadata
+
+
+def test_prepend_metadata_one_row():
+    """Test prepending metadata to chunks"""
+    chunks = ["This is a test.", "Another sentence."]
+    metadata = {"message": "hello"}
+    results = [prepend_metadata(chunk, metadata) for chunk in chunks]
+    assert results[0] == "message: hello\nThis is a test."
+    assert results[1] == "message: hello\nAnother sentence."
+
+
+def test_prepend_metadata_multiple_rows():
+    """Test prepending metadata to chunks"""
+    chunks = ["This is a test.", "Another sentence."]
+    metadata = {"message": "hello", "tag": "first"}
+    results = [prepend_metadata(chunk, metadata) for chunk in chunks]
+    assert results[0] == "message: hello\ntag: first\nThis is a test."
+    assert results[1] == "message: hello\ntag: first\nAnother sentence."
+
+
+def test_prepend_metadata_custom_delimiters():
+    """Test prepending metadata to chunks"""
+    chunks = ["This is a test.", "Another sentence."]
+    metadata = {"message": "hello", "tag": "first"}
+    results = [
+        prepend_metadata(chunk, metadata, delimiter="-", line_delimiter="_")
+        for chunk in chunks
+    ]
+    assert results[0] == "message-hello_tag-first_This is a test."
+    assert results[1] == "message-hello_tag-first_Another sentence."
