
Commit 247547f

Move metadata prepending to a util
1 parent c8dbb02 commit 247547f

File tree

8 files changed: +65 / -51 lines changed

packages/graphrag/graphrag/chunking/chunker.py

Lines changed: 1 addition & 1 deletion

@@ -15,5 +15,5 @@ def __init__(self, **kwargs: Any) -> None:
         """Create a chunker instance."""
 
     @abstractmethod
-    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
+    def chunk(self, text: str) -> list[str]:
         """Chunk method definition."""

packages/graphrag/graphrag/chunking/chunking_config.py

Lines changed: 1 addition & 1 deletion

@@ -32,5 +32,5 @@ class ChunkingConfig(BaseModel):
     )
     prepend_metadata: bool | None = Field(
         description="Prepend metadata into each chunk.",
-        default=None,
+        default=False,
     )
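
With this change, a ChunkingConfig built without an explicit value reports False instead of None for prepend_metadata. A minimal sketch of the new default (assuming the remaining config fields have defaults and do not need to be supplied):

from graphrag.chunking.chunking_config import ChunkingConfig

# Sketch only: assumes ChunkingConfig's other fields all have defaults.
config = ChunkingConfig()
print(config.prepend_metadata)  # False (was None before this commit)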

packages/graphrag/graphrag/chunking/prepend_metadata.py

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""A module containing 'prepend_metadata' function."""
+
+
+def prepend_metadata(
+    text: str, metadata: dict, delimiter: str = ": ", line_delimiter: str = "\n"
+) -> str:
+    """Prepend metadata to the given text. This utility writes the dict as rows of key/value pairs."""
+    metadata_str = (
+        line_delimiter.join(f"{k}{delimiter}{v}" for k, v in metadata.items())
+        + line_delimiter
+    )
+    return metadata_str + text
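
For reference, a minimal usage sketch of the new utility with its default delimiters; the chunk strings and metadata here are made-up examples:

from graphrag.chunking.prepend_metadata import prepend_metadata

# Hypothetical chunks as a chunker might produce them, plus made-up metadata.
chunks = ["This is a test.", "Another sentence."]
metadata = {"title": "Example Doc", "author": "Jane"}

labeled = [prepend_metadata(chunk, metadata) for chunk in chunks]
# labeled[0] == "title: Example Doc\nauthor: Jane\nThis is a test."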

packages/graphrag/graphrag/chunking/sentence_chunker.py

Lines changed: 3 additions & 9 deletions

@@ -14,16 +14,10 @@
 class SentenceChunker(Chunker):
     """A chunker that splits text into sentence-based chunks."""
 
-    def __init__(self, prepend_metadata: bool = False, **kwargs: Any) -> None:
+    def __init__(self, **kwargs: Any) -> None:
         """Create a sentence chunker instance."""
-        self._prepend_metadata = prepend_metadata
         bootstrap()
 
-    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
+    def chunk(self, text) -> list[str]:
         """Chunk the text into sentence-based chunks."""
-        chunks = nltk.sent_tokenize(text)
-
-        if self._prepend_metadata and metadata is not None:
-            metadata_str = ".\n".join(f"{k}: {v}" for k, v in metadata.items()) + ".\n"
-            chunks = [metadata_str + chunk for chunk in chunks]
-        return chunks
+        return nltk.sent_tokenize(text)

packages/graphrag/graphrag/chunking/token_chunker.py

Lines changed: 2 additions & 10 deletions

@@ -23,31 +23,23 @@ def __init__(
         size: int,
         overlap: int,
         tokenizer: Tokenizer,
-        prepend_metadata: bool = False,
         **kwargs: Any,
     ) -> None:
         """Create a token chunker instance."""
         self._size = size
         self._overlap = overlap
-        self._prepend_metadata = prepend_metadata
         self._tokenizer = tokenizer
 
-    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
+    def chunk(self, text: str) -> list[str]:
         """Chunk the text into token-based chunks."""
-        chunks = split_text_on_tokens(
+        return split_text_on_tokens(
             text,
             chunk_size=self._size,
             chunk_overlap=self._overlap,
             encode=self._tokenizer.encode,
             decode=self._tokenizer.decode,
         )
 
-        if self._prepend_metadata and metadata is not None:
-            metadata_str = ".\n".join(f"{k}: {v}" for k, v in metadata.items()) + ".\n"
-            chunks = [metadata_str + chunk for chunk in chunks]
-
-        return chunks
-
 
 def split_text_on_tokens(
     text: str,
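
Note one small behavioral difference: the inlined logic removed from both chunkers joined metadata rows with ".\n", while the new utility defaults to ": " between key and value and a plain "\n" line delimiter. A quick comparison sketch:

from graphrag.chunking.prepend_metadata import prepend_metadata

metadata = {"message": "hello"}
chunk = "This is a test."

# Old inlined behavior (removed from the chunkers above):
old = ".\n".join(f"{k}: {v}" for k, v in metadata.items()) + ".\n" + chunk
# old == "message: hello.\nThis is a test."

# New utility with its defaults:
new = prepend_metadata(chunk, metadata)
# new == "message: hello\nThis is a test."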

packages/graphrag/graphrag/index/workflows/create_base_text_units.py

Lines changed: 9 additions & 4 deletions

@@ -13,6 +13,7 @@
 from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
 from graphrag.chunking.chunker import Chunker
 from graphrag.chunking.chunker_factory import create_chunker
+from graphrag.chunking.prepend_metadata import prepend_metadata as prepend_metadata_fn
 from graphrag.config.models.graph_rag_config import GraphRagConfig
 from graphrag.index.typing.context import PipelineRunContext
 from graphrag.index.typing.workflow import WorkflowFunctionOutput

@@ -39,6 +40,7 @@ async def run_workflow(
         context.callbacks,
         tokenizer=tokenizer,
         chunker=chunker,
+        prepend_metadata=config.chunks.prepend_metadata,
     )
 
     await write_table_to_storage(output, "text_units", context.output_storage)

@@ -52,6 +54,7 @@ def create_base_text_units(
     callbacks: WorkflowCallbacks,
     tokenizer: Tokenizer,
     chunker: Chunker,
+    prepend_metadata: bool | None = False,
 ) -> pd.DataFrame:
     """All the steps to transform base text_units."""
     documents.sort_values(by=["id"], ascending=[True], inplace=True)

@@ -63,10 +66,12 @@ def create_base_text_units(
     logger.info("Starting chunking process for %d documents", total_rows)
 
     def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
-        metadata = row.get("metadata")
-        if (metadata is not None) and isinstance(metadata, str):
-            metadata = json.loads(metadata)
-        row["chunks"] = chunker.chunk(row["text"], metadata=metadata)
+        row["chunks"] = chunker.chunk(row["text"])
+
+        metadata = row.get("metadata", None)
+        if prepend_metadata and metadata is not None:
+            metadata = json.loads(metadata) if isinstance(metadata, str) else metadata
+            row["chunks"] = [prepend_metadata_fn(chunk, metadata) for chunk in row["chunks"]]
         tick()
         logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
         return row
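
The same composition can be reproduced outside the pipeline; a small sketch (the text and metadata values are illustrative, and metadata is parsed from a JSON string as in the workflow above):

import json

from graphrag.chunking.prepend_metadata import prepend_metadata
from graphrag.chunking.sentence_chunker import SentenceChunker

chunker = SentenceChunker()
text = "First sentence. Second sentence."
raw_metadata = '{"title": "report"}'  # illustrative; documents may carry metadata as a JSON string

chunks = chunker.chunk(text)
metadata = json.loads(raw_metadata) if isinstance(raw_metadata, str) else raw_metadata
chunks = [prepend_metadata(chunk, metadata) for chunk in chunks]
# chunks[0] == "title: report\nFirst sentence."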

tests/unit/chunking/test_chunker.py

Lines changed: 0 additions & 26 deletions

@@ -48,18 +48,6 @@ def test_mixed_whitespace_handling(self):
         assert chunks[0] == " Sentence with spaces."
         assert chunks[1] == "Another one!"
 
-    def test_prepend_metadata(self):
-        """Test prepending metadata to chunks"""
-        input = "This is a test. Another sentence."
-        config = ChunkingConfig(
-            strategy=ChunkStrategyType.Sentence, prepend_metadata=True
-        )
-        chunker = create_chunker(config)
-        chunks = chunker.chunk(input, metadata={"message": "hello"})
-
-        assert chunks[0] == "message: hello.\nThis is a test."
-        assert chunks[1] == "message: hello.\nAnother sentence."
-
 
 class TestRunTokens:
     @patch("tiktoken.get_encoding")

@@ -83,20 +71,6 @@ def test_basic_functionality(self, mock_get_encoding):
 
         assert len(chunks) > 0
 
-    def test_prepend_metadata(self):
-        """Test prepending metadata to chunks"""
-        mocked_tokenizer = MockTokenizer()
-        input = "This is a test."
-        config = ChunkingConfig(
-            strategy=ChunkStrategyType.Tokens, size=5, overlap=0, prepend_metadata=True
-        )
-        chunker = create_chunker(config, tokenizer=mocked_tokenizer)
-        chunks = chunker.chunk(input, metadata={"message": "hello"})
-
-        assert chunks[0] == "message: hello.\nThis "
-        assert chunks[1] == "message: hello.\nis a "
-        assert chunks[2] == "message: hello.\ntest."
-
 
 def test_split_text_str_empty():
     result = split_text_on_tokens(

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+from graphrag.chunking.prepend_metadata import prepend_metadata
+
+
+def test_prepend_metadata_one_row():
+    """Test prepending metadata to chunks"""
+    chunks = ["This is a test.", "Another sentence."]
+    metadata = {"message": "hello"}
+    results = [prepend_metadata(chunk, metadata) for chunk in chunks]
+    assert results[0] == "message: hello\nThis is a test."
+    assert results[1] == "message: hello\nAnother sentence."
+
+
+def test_prepend_metadata_multiple_rows():
+    """Test prepending metadata to chunks"""
+    chunks = ["This is a test.", "Another sentence."]
+    metadata = {"message": "hello", "tag": "first"}
+    results = [prepend_metadata(chunk, metadata) for chunk in chunks]
+    assert results[0] == "message: hello\ntag: first\nThis is a test."
+    assert results[1] == "message: hello\ntag: first\nAnother sentence."
+
+
+def test_prepend_metadata_custom_delimiters():
+    """Test prepending metadata to chunks"""
+    chunks = ["This is a test.", "Another sentence."]
+    metadata = {"message": "hello", "tag": "first"}
+    results = [
+        prepend_metadata(chunk, metadata, delimiter="-", line_delimiter="_")
+        for chunk in chunks
+    ]
+    assert results[0] == "message-hello_tag-first_This is a test."
+    assert results[1] == "message-hello_tag-first_Another sentence."
