Skip to content

Commit c8dbb02

Browse files
committed
Revert ChunkingDocument interface
1 parent 026474a commit c8dbb02

File tree

7 files changed

+11
-64
lines changed

7 files changed

+11
-64
lines changed

packages/graphrag/graphrag/chunking/chunker.py

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -6,18 +6,14 @@
66
from abc import ABC, abstractmethod
77
from typing import Any
88

9-
from graphrag.chunking.chunking_document import ChunkingDocument
10-
119

1210
class Chunker(ABC):
13-
"""Abstract base class for text chunkers."""
11+
"""Abstract base class for document chunkers."""
1412

1513
@abstractmethod
1614
def __init__(self, **kwargs: Any) -> None:
1715
"""Create a chunker instance."""
1816

1917
@abstractmethod
20-
def chunk(
21-
self, document: ChunkingDocument, metadata: dict | None = None
22-
) -> list[str]:
18+
def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
2319
"""Chunk method definition."""

packages/graphrag/graphrag/chunking/chunking_document.py

Lines changed: 0 additions & 15 deletions
This file was deleted.

packages/graphrag/graphrag/chunking/sentence_chunker.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@
99

1010
from graphrag.chunking.bootstrap_nltk import bootstrap
1111
from graphrag.chunking.chunker import Chunker
12-
from graphrag.chunking.chunking_document import ChunkingDocument
1312

1413

1514
class SentenceChunker(Chunker):
@@ -20,11 +19,8 @@ def __init__(self, prepend_metadata: bool = False, **kwargs: Any) -> None:
2019
self._prepend_metadata = prepend_metadata
2120
bootstrap()
2221

23-
def chunk(
24-
self, document: ChunkingDocument, metadata: dict | None = None
25-
) -> list[str]:
22+
def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
2623
"""Chunk the text into sentence-based chunks."""
27-
text = str(document)
2824
chunks = nltk.sent_tokenize(text)
2925

3026
if self._prepend_metadata and metadata is not None:

packages/graphrag/graphrag/chunking/text_chunking_document.py

Lines changed: 0 additions & 20 deletions
This file was deleted.

packages/graphrag/graphrag/chunking/token_chunker.py

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@
99
from graphrag_common.types.tokenizer import Tokenizer
1010

1111
from graphrag.chunking.chunker import Chunker
12-
from graphrag.chunking.chunking_document import ChunkingDocument
1312

1413
EncodedText = list[int]
1514
DecodeFn = Callable[[EncodedText], str]
@@ -33,13 +32,8 @@ def __init__(
3332
self._prepend_metadata = prepend_metadata
3433
self._tokenizer = tokenizer
3534

36-
def chunk(
37-
self, document: ChunkingDocument, metadata: dict | None = None
38-
) -> list[str]:
35+
def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
3936
"""Chunk the text into token-based chunks."""
40-
# we have to create and measure the metadata first to account for the length when chunking
41-
text = str(document)
42-
4337
chunks = split_text_on_tokens(
4438
text,
4539
chunk_size=self._size,

packages/graphrag/graphrag/index/workflows/create_base_text_units.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,6 @@
1313
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
1414
from graphrag.chunking.chunker import Chunker
1515
from graphrag.chunking.chunker_factory import create_chunker
16-
from graphrag.chunking.text_chunking_document import TextChunkingDocument
1716
from graphrag.config.models.graph_rag_config import GraphRagConfig
1817
from graphrag.index.typing.context import PipelineRunContext
1918
from graphrag.index.typing.workflow import WorkflowFunctionOutput
@@ -67,8 +66,7 @@ def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
6766
metadata = row.get("metadata")
6867
if (metadata is not None) and isinstance(metadata, str):
6968
metadata = json.loads(metadata)
70-
document = TextChunkingDocument(text=row["text"])
71-
row["chunks"] = chunker.chunk(document, metadata=metadata)
69+
row["chunks"] = chunker.chunk(row["text"], metadata=metadata)
7270
tick()
7371
logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
7472
return row

tests/unit/chunking/test_chunker.py

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,6 @@
77
from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
88
from graphrag.chunking.chunker_factory import create_chunker
99
from graphrag.chunking.chunking_config import ChunkingConfig
10-
from graphrag.chunking.text_chunking_document import TextChunkingDocument
1110
from graphrag.chunking.token_chunker import (
1211
split_text_on_tokens,
1312
)
@@ -32,7 +31,7 @@ def setup_method(self, method):
3231

3332
def test_basic_functionality(self):
3433
"""Test basic sentence splitting without metadata"""
35-
input = TextChunkingDocument(text="This is a test. Another sentence.")
34+
input = "This is a test. Another sentence."
3635
chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
3736
chunks = chunker.chunk(input)
3837

@@ -43,15 +42,15 @@ def test_basic_functionality(self):
4342
def test_mixed_whitespace_handling(self):
4443
"""Test input with irregular whitespace"""
4544

46-
input = TextChunkingDocument(text=" Sentence with spaces. Another one! ")
45+
input = " Sentence with spaces. Another one! "
4746
chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
4847
chunks = chunker.chunk(input)
4948
assert chunks[0] == " Sentence with spaces."
5049
assert chunks[1] == "Another one!"
5150

5251
def test_prepend_metadata(self):
5352
"""Test prepending metadata to chunks"""
54-
input = TextChunkingDocument(text="This is a test. Another sentence.")
53+
input = "This is a test. Another sentence."
5554
config = ChunkingConfig(
5655
strategy=ChunkStrategyType.Sentence, prepend_metadata=True
5756
)
@@ -70,9 +69,8 @@ def test_basic_functionality(self, mock_get_encoding):
7069
mock_encoder.decode.side_effect = lambda x: bytes(x).decode()
7170
mock_get_encoding.return_value = mock_encoder
7271

73-
input = TextChunkingDocument(
74-
text="Marley was dead: to begin with. There is no doubt whatever about that. The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner. Scrooge signed it. And Scrooge's name was good upon 'Change, for anything he chose to put his hand to."
75-
)
72+
input = "Marley was dead: to begin with. There is no doubt whatever about that. The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner. Scrooge signed it. And Scrooge's name was good upon 'Change, for anything he chose to put his hand to."
73+
7674
config = ChunkingConfig(
7775
size=5,
7876
overlap=1,
@@ -88,7 +86,7 @@ def test_basic_functionality(self, mock_get_encoding):
8886
def test_prepend_metadata(self):
8987
"""Test prepending metadata to chunks"""
9088
mocked_tokenizer = MockTokenizer()
91-
input = TextChunkingDocument(text="This is a test.")
89+
input = "This is a test."
9290
config = ChunkingConfig(
9391
strategy=ChunkStrategyType.Tokens, size=5, overlap=0, prepend_metadata=True
9492
)

0 commit comments

Comments (0)