
Commit 9aa94df

Streamline config
1 parent 896a48c commit 9aa94df

File tree: 9 files changed, +33 -34 lines changed

packages/graphrag/graphrag/chunking/chunk_strategy_type.py

Lines changed: 4 additions & 8 deletions

@@ -3,15 +3,11 @@

 """Chunk strategy type enumeration."""

-from enum import Enum
+from enum import StrEnum


-class ChunkStrategyType(str, Enum):
+class ChunkStrategyType(StrEnum):
     """ChunkStrategy class definition."""

-    tokens = "tokens"
-    sentence = "sentence"
-
-    def __repr__(self):
-        """Get a string representation."""
-        return f'"{self.value}"'
+    Tokens = "tokens"
+    Sentence = "sentence"
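
For reference, StrEnum (Python 3.11+) makes the str mixin and the custom __repr__ unnecessary, because members already behave as plain strings in comparisons and formatting. A minimal standalone sketch of that behavior (not part of the commit):

# Minimal sketch: StrEnum members are real strings, so they compare and
# format as their values without a custom __repr__. Requires Python 3.11+.
from enum import StrEnum


class ChunkStrategyType(StrEnum):
    """ChunkStrategy class definition."""

    Tokens = "tokens"
    Sentence = "sentence"


assert ChunkStrategyType.Tokens == "tokens"       # equality against raw config strings
assert f"{ChunkStrategyType.Tokens}" == "tokens"  # f-strings render the value
print(ChunkStrategyType("sentence"))              # round-trips from a raw string -> sentence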

packages/graphrag/graphrag/chunking/chunker_factory.py

Lines changed: 4 additions & 4 deletions

@@ -59,14 +59,14 @@ def create_chunker(

     if chunker_strategy not in chunker_factory:
         match chunker_strategy:
-            case ChunkStrategyType.tokens:
+            case ChunkStrategyType.Tokens:
                 from graphrag.chunking.token_chunker import TokenChunker

-                register_chunker(ChunkStrategyType.tokens, TokenChunker)
-            case ChunkStrategyType.sentence:
+                register_chunker(ChunkStrategyType.Tokens, TokenChunker)
+            case ChunkStrategyType.Sentence:
                 from graphrag.chunking.sentence_chunker import SentenceChunker

-                register_chunker(ChunkStrategyType.sentence, SentenceChunker)
+                register_chunker(ChunkStrategyType.Sentence, SentenceChunker)
             case _:
                 msg = f"ChunkingConfig.strategy '{chunker_strategy}' is not registered in the ChunkerFactory. Registered types: {', '.join(chunker_factory.keys())}."
                 raise ValueError(msg)
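
The surrounding factory is a dict-backed registry with lazy registration: a built-in strategy is only imported and registered the first time it is requested, and an unknown strategy raises a ValueError listing what is registered. A simplified, self-contained sketch of that pattern (toy classes and a local registry, not the actual graphrag implementation, which defers the module imports into each case branch):

# Toy sketch of the registry pattern behind create_chunker/register_chunker.
# The classes and registry here are stand-ins, not the graphrag implementations.
class TokenChunker:
    def chunk(self, text: str) -> list[str]:
        return text.split()


class SentenceChunker:
    def chunk(self, text: str) -> list[str]:
        return [s.strip() + "." for s in text.split(".") if s.strip()]


_registry: dict[str, type] = {}


def register_chunker(strategy: str, chunker_cls: type) -> None:
    """Map a strategy name onto a chunker class."""
    _registry[strategy] = chunker_cls


def create_chunker(strategy: str) -> object:
    """Register built-in strategies on first use, then instantiate one."""
    if strategy not in _registry:
        match strategy:
            case "tokens":
                register_chunker("tokens", TokenChunker)
            case "sentence":
                register_chunker("sentence", SentenceChunker)
            case _:
                msg = f"Strategy '{strategy}' is not registered. Known: {', '.join(_registry)}."
                raise ValueError(msg)
    return _registry[strategy]()


print(create_chunker("sentence").chunk("First. Second."))  # ['First.', 'Second.']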

packages/graphrag/graphrag/chunking/chunking_config.py

Lines changed: 15 additions & 13 deletions

@@ -3,36 +3,38 @@

 """Parameterization settings for the default configuration."""

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field

 from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
-from graphrag.config.defaults import graphrag_config_defaults


 class ChunkingConfig(BaseModel):
     """Configuration section for chunking."""

+    model_config = ConfigDict(extra="allow")
+    """Allow extra fields to support custom cache implementations."""
+
     strategy: str = Field(
         description="The chunking strategy to use.",
-        default=ChunkStrategyType.tokens,
+        default=ChunkStrategyType.Tokens,
     )
-    size: int = Field(
+    size: int | None = Field(
         description="The chunk size to use.",
-        default=graphrag_config_defaults.chunks.size,
+        default=None,
     )
-    overlap: int = Field(
+    overlap: int | None = Field(
         description="The chunk overlap to use.",
-        default=graphrag_config_defaults.chunks.overlap,
+        default=None,
     )
-    encoding_model: str = Field(
+    encoding_model: str | None = Field(
         description="The encoding model to use.",
-        default=graphrag_config_defaults.chunks.encoding_model,
+        default=None,
     )
-    prepend_metadata: bool = Field(
+    prepend_metadata: bool | None = Field(
         description="Prepend metadata into each chunk.",
-        default=graphrag_config_defaults.chunks.prepend_metadata,
+        default=None,
     )
-    chunk_size_includes_metadata: bool = Field(
+    chunk_size_includes_metadata: bool | None = Field(
         description="Count metadata in max tokens.",
-        default=graphrag_config_defaults.chunks.chunk_size_includes_metadata,
+        default=None,
     )
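
Since size, overlap, encoding_model, prepend_metadata, and chunk_size_includes_metadata now default to None instead of the values in graphrag_config_defaults, the fallback has to happen wherever the config is consumed; that resolution code is not part of this commit. A hypothetical helper to illustrate the idea (the imports are real module paths from this diff; the function itself is an assumption):

# Hypothetical resolution helper (not in this commit): fall back to the
# package defaults when a ChunkingConfig field was left unset (None).
from graphrag.chunking.chunking_config import ChunkingConfig
from graphrag.config.defaults import graphrag_config_defaults


def resolve_chunk_size(config: ChunkingConfig) -> int:
    """Return the configured size, or the package default when unset."""
    default_size = graphrag_config_defaults.chunks.size
    return config.size if config.size is not None else default_size


config = ChunkingConfig()          # size/overlap/encoding_model all default to None
print(resolve_chunk_size(config))  # 1200, per ChunksDefaults in defaults.py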

packages/graphrag/graphrag/chunking/sentence_chunker.py

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@
 class SentenceChunker(Chunker):
     """A chunker that splits text into sentence-based chunks."""

-    def __init__(self, prepend_metadata: bool, **kwargs: Any) -> None:
+    def __init__(self, prepend_metadata: bool = False, **kwargs: Any) -> None:
         """Create a sentence chunker instance."""
         self._prepend_metadata = prepend_metadata
         bootstrap()
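
With the new default, a SentenceChunker can now be constructed with no arguments. A small usage sketch; the expected output is inferred from the sentence tests further down, so treat it as indicative rather than exact:

# Usage sketch: prepend_metadata now defaults to False, so no arguments are
# required. Expected chunks are inferred from tests/unit/chunking/test_chunker.py.
from graphrag.chunking.sentence_chunker import SentenceChunker

chunker = SentenceChunker()
print(chunker.chunk("This is a test. Another sentence."))
# expected: ['This is a test.', 'Another sentence.']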

packages/graphrag/graphrag/chunking/token_chunker.py

Lines changed: 2 additions & 2 deletions

@@ -24,9 +24,9 @@ def __init__(
         size: int,
         overlap: int,
         encoding_model: str,
-        prepend_metadata: bool,
-        chunk_size_includes_metadata: bool,
         tokenizer: Tokenizer,
+        prepend_metadata: bool = False,
+        chunk_size_includes_metadata: bool = False,
         **kwargs: Any,
     ) -> None:
         """Create a token chunker instance."""

packages/graphrag/graphrag/config/defaults.py

Lines changed: 1 addition & 1 deletion

@@ -60,9 +60,9 @@ class BasicSearchDefaults:
 class ChunksDefaults:
     """Default values for chunks."""

+    strategy: str = ChunkStrategyType.Tokens
     size: int = 1200
     overlap: int = 100
-    strategy: ClassVar[ChunkStrategyType] = ChunkStrategyType.tokens
     encoding_model: str = ENCODING_MODEL
     prepend_metadata: bool = False
     chunk_size_includes_metadata: bool = False

packages/graphrag/graphrag/config/init_content.py

Lines changed: 2 additions & 0 deletions

@@ -55,8 +55,10 @@
   file_type: {graphrag_config_defaults.input.file_type.value} # [csv, text, json]

 chunks:
+  strategy: {graphrag_config_defaults.chunks.strategy}
   size: {graphrag_config_defaults.chunks.size}
   overlap: {graphrag_config_defaults.chunks.overlap}
+  encoding_model: {graphrag_config_defaults.chunks.encoding_model}

 ### Output/storage settings ###
 ## If blob storage is specified in the following four sections,
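
With the defaults in defaults.py above, the two new template lines render the tokens strategy and the package encoding model into the generated settings. A rough sketch of the interpolation, abbreviated to just the chunks block (the real init content contains many more sections):

# Abbreviated sketch of how the init template interpolates the chunk defaults.
from graphrag.config.defaults import graphrag_config_defaults

chunks_block = f"""\
chunks:
  strategy: {graphrag_config_defaults.chunks.strategy}
  size: {graphrag_config_defaults.chunks.size}
  overlap: {graphrag_config_defaults.chunks.overlap}
  encoding_model: {graphrag_config_defaults.chunks.encoding_model}
"""

print(chunks_block)
# Roughly:
# chunks:
#   strategy: tokens
#   size: 1200
#   overlap: 100
#   encoding_model: <whatever ENCODING_MODEL is set to>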

packages/graphrag/graphrag/index/workflows/create_base_text_units.py

Lines changed: 0 additions & 1 deletion

@@ -92,4 +92,3 @@ def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
     return cast(
         "pd.DataFrame", text_units[text_units["text"].notna()].reset_index(drop=True)
     )
-

tests/unit/chunking/test_chunker.py

Lines changed: 4 additions & 4 deletions

@@ -31,7 +31,7 @@ def setup_method(self, method):
     def test_basic_functionality(self):
         """Test basic sentence splitting without metadata"""
         input = "This is a test. Another sentence."
-        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence))
+        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = chunker.chunk(input)

         assert len(chunks) == 2

@@ -41,14 +41,14 @@ def test_basic_functionality(self):
     def test_multiple_documents(self):
         """Test processing multiple input documents"""
         input = ["First. Document.", "Second. Doc."]
-        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence))
+        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = [chunk for doc in input for chunk in chunker.chunk(doc)]
         assert len(chunks) == 4

     def test_mixed_whitespace_handling(self):
         """Test input with irregular whitespace"""
         input = " Sentence with spaces. Another one! "
-        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence))
+        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = chunker.chunk(input)
         assert chunks[0] == " Sentence with spaces."
         assert chunks[1] == "Another one!"

@@ -67,7 +67,7 @@ def test_basic_functionality(self, mock_get_encoding):
             size=5,
             overlap=1,
             encoding_model="fake-encoding",
-            strategy=ChunkStrategyType.tokens,
+            strategy=ChunkStrategyType.Tokens,
         )

         chunker = create_chunker(config, tokenizer=tokenizer)
