Skip to content

Commit c8dbb02

Browse files
committed
Revert ChunkingDocument interface
1 parent 026474a commit c8dbb02

File tree

7 files changed

+11
-64
lines changed

7 files changed

+11
-64
lines changed

packages/graphrag/graphrag/chunking/chunker.py

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -6,18 +6,14 @@
66
from abc import ABC, abstractmethod
77
from typing import Any
88

9-
from graphrag.chunking.chunking_document import ChunkingDocument
10-
119

1210
class Chunker(ABC):
13-
"""Abstract base class for text chunkers."""
11+
"""Abstract base class for document chunkers."""
1412

1513
@abstractmethod
1614
def __init__(self, **kwargs: Any) -> None:
1715
"""Create a chunker instance."""
1816

1917
@abstractmethod
20-
def chunk(
21-
self, document: ChunkingDocument, metadata: dict | None = None
22-
) -> list[str]:
18+
def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
2319
"""Chunk method definition."""

packages/graphrag/graphrag/chunking/chunking_document.py

Lines changed: 0 additions & 15 deletions
This file was deleted.

packages/graphrag/graphrag/chunking/sentence_chunker.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@
99

1010
from graphrag.chunking.bootstrap_nltk import bootstrap
1111
from graphrag.chunking.chunker import Chunker
12-
from graphrag.chunking.chunking_document import ChunkingDocument
1312

1413

1514
class SentenceChunker(Chunker):
@@ -20,11 +19,8 @@ def __init__(self, prepend_metadata: bool = False, **kwargs: Any) -> None:
2019
self._prepend_metadata = prepend_metadata
2120
bootstrap()
2221

23-
def chunk(
24-
self, document: ChunkingDocument, metadata: dict | None = None
25-
) -> list[str]:
22+
def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
2623
"""Chunk the text into sentence-based chunks."""
27-
text = str(document)
2824
chunks = nltk.sent_tokenize(text)
2925

3026
if self._prepend_metadata and metadata is not None:

packages/graphrag/graphrag/chunking/text_chunking_document.py

Lines changed: 0 additions & 20 deletions
This file was deleted.

packages/graphrag/graphrag/chunking/token_chunker.py

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@
99
from graphrag_common.types.tokenizer import Tokenizer
1010

1111
from graphrag.chunking.chunker import Chunker
12-
from graphrag.chunking.chunking_document import ChunkingDocument
1312

1413
EncodedText = list[int]
1514
DecodeFn = Callable[[EncodedText], str]
@@ -33,13 +32,8 @@ def __init__(
3332
self._prepend_metadata = prepend_metadata
3433
self._tokenizer = tokenizer
3534

36-
def chunk(
37-
self, document: ChunkingDocument, metadata: dict | None = None
38-
) -> list[str]:
35+
def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
3936
"""Chunk the text into token-based chunks."""
40-
# we have to create and measure the metadata first to account for the length when chunking
41-
text = str(document)
42-
4337
chunks = split_text_on_tokens(
4438
text,
4539
chunk_size=self._size,

packages/graphrag/graphrag/index/workflows/create_base_text_units.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,6 @@
1313
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
1414
from graphrag.chunking.chunker import Chunker
1515
from graphrag.chunking.chunker_factory import create_chunker
16-
from graphrag.chunking.text_chunking_document import TextChunkingDocument
1716
from graphrag.config.models.graph_rag_config import GraphRagConfig
1817
from graphrag.index.typing.context import PipelineRunContext
1918
from graphrag.index.typing.workflow import WorkflowFunctionOutput
@@ -67,8 +66,7 @@ def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
6766
metadata = row.get("metadata")
6867
if (metadata is not None) and isinstance(metadata, str):
6968
metadata = json.loads(metadata)
70-
document = TextChunkingDocument(text=row["text"])
71-
row["chunks"] = chunker.chunk(document, metadata=metadata)
69+
row["chunks"] = chunker.chunk(row["text"], metadata=metadata)
7270
tick()
7371
logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
7472
return row

tests/unit/chunking/test_chunker.py

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,6 @@
77
from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
88
from graphrag.chunking.chunker_factory import create_chunker
99
from graphrag.chunking.chunking_config import ChunkingConfig
10-
from graphrag.chunking.text_chunking_document import TextChunkingDocument
1110
from graphrag.chunking.token_chunker import (
1211
split_text_on_tokens,
1312
)
@@ -32,7 +31,7 @@ def setup_method(self, method):
3231

3332
def test_basic_functionality(self):
3433
"""Test basic sentence splitting without metadata"""
35-
input = TextChunkingDocument(text="This is a test. Another sentence.")
34+
input = "This is a test. Another sentence."
3635
chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
3736
chunks = chunker.chunk(input)
3837

@@ -43,15 +42,15 @@ def test_basic_functionality(self):
4342
def test_mixed_whitespace_handling(self):
4443
"""Test input with irregular whitespace"""
4544

46-
input = TextChunkingDocument(text=" Sentence with spaces. Another one! ")
45+
input = " Sentence with spaces. Another one! "
4746
chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
4847
chunks = chunker.chunk(input)
4948
assert chunks[0] == " Sentence with spaces."
5049
assert chunks[1] == "Another one!"
5150

5251
def test_prepend_metadata(self):
5352
"""Test prepending metadata to chunks"""
54-
input = TextChunkingDocument(text="This is a test. Another sentence.")
53+
input = "This is a test. Another sentence."
5554
config = ChunkingConfig(
5655
strategy=ChunkStrategyType.Sentence, prepend_metadata=True
5756
)
@@ -70,9 +69,8 @@ def test_basic_functionality(self, mock_get_encoding):
7069
mock_encoder.decode.side_effect = lambda x: bytes(x).decode()
7170
mock_get_encoding.return_value = mock_encoder
7271

73-
input = TextChunkingDocument(
74-
text="Marley was dead: to begin with. There is no doubt whatever about that. The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner. Scrooge signed it. And Scrooge's name was good upon 'Change, for anything he chose to put his hand to."
75-
)
72+
input = "Marley was dead: to begin with. There is no doubt whatever about that. The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner. Scrooge signed it. And Scrooge's name was good upon 'Change, for anything he chose to put his hand to."
73+
7674
config = ChunkingConfig(
7775
size=5,
7876
overlap=1,
@@ -88,7 +86,7 @@ def test_basic_functionality(self, mock_get_encoding):
8886
def test_prepend_metadata(self):
8987
"""Test prepending metadata to chunks"""
9088
mocked_tokenizer = MockTokenizer()
91-
input = TextChunkingDocument(text="This is a test.")
89+
input = "This is a test."
9290
config = ChunkingConfig(
9391
strategy=ChunkStrategyType.Tokens, size=5, overlap=0, prepend_metadata=True
9492
)

0 commit comments

Comments (0)