
Commit 9aa94df

Streamline config
1 parent 896a48c commit 9aa94df

File tree: 9 files changed, +33 -34 lines changed

packages/graphrag/graphrag/chunking/chunk_strategy_type.py

Lines changed: 4 additions & 8 deletions

@@ -3,15 +3,11 @@

 """Chunk strategy type enumeration."""

-from enum import Enum
+from enum import StrEnum


-class ChunkStrategyType(str, Enum):
+class ChunkStrategyType(StrEnum):
     """ChunkStrategy class definition."""

-    tokens = "tokens"
-    sentence = "sentence"
-
-    def __repr__(self):
-        """Get a string representation."""
-        return f'"{self.value}"'
+    Tokens = "tokens"
+    Sentence = "sentence"
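
For reference, StrEnum (Python 3.11+) makes the str mixin and the custom __repr__ unnecessary, because members already behave as plain strings in comparisons and formatting. A minimal standalone sketch of that behavior (not part of the commit):

# Minimal sketch: StrEnum members are real strings, so they compare and
# format as their values without a custom __repr__. Requires Python 3.11+.
from enum import StrEnum


class ChunkStrategyType(StrEnum):
    """ChunkStrategy class definition."""

    Tokens = "tokens"
    Sentence = "sentence"


assert ChunkStrategyType.Tokens == "tokens"       # equality against raw config strings
assert f"{ChunkStrategyType.Tokens}" == "tokens"  # f-strings render the value
print(ChunkStrategyType("sentence"))              # round-trips from a raw string -> sentence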

packages/graphrag/graphrag/chunking/chunker_factory.py

Lines changed: 4 additions & 4 deletions

@@ -59,14 +59,14 @@ def create_chunker(

     if chunker_strategy not in chunker_factory:
         match chunker_strategy:
-            case ChunkStrategyType.tokens:
+            case ChunkStrategyType.Tokens:
                 from graphrag.chunking.token_chunker import TokenChunker

-                register_chunker(ChunkStrategyType.tokens, TokenChunker)
-            case ChunkStrategyType.sentence:
+                register_chunker(ChunkStrategyType.Tokens, TokenChunker)
+            case ChunkStrategyType.Sentence:
                 from graphrag.chunking.sentence_chunker import SentenceChunker

-                register_chunker(ChunkStrategyType.sentence, SentenceChunker)
+                register_chunker(ChunkStrategyType.Sentence, SentenceChunker)
             case _:
                 msg = f"ChunkingConfig.strategy '{chunker_strategy}' is not registered in the ChunkerFactory. Registered types: {', '.join(chunker_factory.keys())}."
                 raise ValueError(msg)
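
The surrounding factory is a dict-backed registry with lazy registration: a built-in strategy is only imported and registered the first time it is requested, and an unknown strategy raises a ValueError listing what is registered. A simplified, self-contained sketch of that pattern (toy classes and a local registry, not the actual graphrag implementation, which defers the module imports into each case branch):

# Toy sketch of the registry pattern behind create_chunker/register_chunker.
# The classes and registry here are stand-ins, not the graphrag implementations.
class TokenChunker:
    def chunk(self, text: str) -> list[str]:
        return text.split()


class SentenceChunker:
    def chunk(self, text: str) -> list[str]:
        return [s.strip() + "." for s in text.split(".") if s.strip()]


_registry: dict[str, type] = {}


def register_chunker(strategy: str, chunker_cls: type) -> None:
    """Map a strategy name onto a chunker class."""
    _registry[strategy] = chunker_cls


def create_chunker(strategy: str) -> object:
    """Register built-in strategies on first use, then instantiate one."""
    if strategy not in _registry:
        match strategy:
            case "tokens":
                register_chunker("tokens", TokenChunker)
            case "sentence":
                register_chunker("sentence", SentenceChunker)
            case _:
                msg = f"Strategy '{strategy}' is not registered. Known: {', '.join(_registry)}."
                raise ValueError(msg)
    return _registry[strategy]()


print(create_chunker("sentence").chunk("First. Second."))  # ['First.', 'Second.']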

packages/graphrag/graphrag/chunking/chunking_config.py

Lines changed: 15 additions & 13 deletions

@@ -3,36 +3,38 @@

 """Parameterization settings for the default configuration."""

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field

 from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
-from graphrag.config.defaults import graphrag_config_defaults


 class ChunkingConfig(BaseModel):
     """Configuration section for chunking."""

+    model_config = ConfigDict(extra="allow")
+    """Allow extra fields to support custom cache implementations."""
+
     strategy: str = Field(
         description="The chunking strategy to use.",
-        default=ChunkStrategyType.tokens,
+        default=ChunkStrategyType.Tokens,
     )
-    size: int = Field(
+    size: int | None = Field(
         description="The chunk size to use.",
-        default=graphrag_config_defaults.chunks.size,
+        default=None,
     )
-    overlap: int = Field(
+    overlap: int | None = Field(
         description="The chunk overlap to use.",
-        default=graphrag_config_defaults.chunks.overlap,
+        default=None,
     )
-    encoding_model: str = Field(
+    encoding_model: str | None = Field(
         description="The encoding model to use.",
-        default=graphrag_config_defaults.chunks.encoding_model,
+        default=None,
     )
-    prepend_metadata: bool = Field(
+    prepend_metadata: bool | None = Field(
         description="Prepend metadata into each chunk.",
-        default=graphrag_config_defaults.chunks.prepend_metadata,
+        default=None,
     )
-    chunk_size_includes_metadata: bool = Field(
+    chunk_size_includes_metadata: bool | None = Field(
         description="Count metadata in max tokens.",
-        default=graphrag_config_defaults.chunks.chunk_size_includes_metadata,
+        default=None,
     )
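
Since size, overlap, encoding_model, prepend_metadata, and chunk_size_includes_metadata now default to None instead of the values in graphrag_config_defaults, the fallback has to happen wherever the config is consumed; that resolution code is not part of this commit. A hypothetical helper to illustrate the idea (the imports are real module paths from this diff; the function itself is an assumption):

# Hypothetical resolution helper (not in this commit): fall back to the
# package defaults when a ChunkingConfig field was left unset (None).
from graphrag.chunking.chunking_config import ChunkingConfig
from graphrag.config.defaults import graphrag_config_defaults


def resolve_chunk_size(config: ChunkingConfig) -> int:
    """Return the configured size, or the package default when unset."""
    default_size = graphrag_config_defaults.chunks.size
    return config.size if config.size is not None else default_size


config = ChunkingConfig()          # size/overlap/encoding_model all default to None
print(resolve_chunk_size(config))  # 1200, per ChunksDefaults in defaults.py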

packages/graphrag/graphrag/chunking/sentence_chunker.py

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@
 class SentenceChunker(Chunker):
     """A chunker that splits text into sentence-based chunks."""

-    def __init__(self, prepend_metadata: bool, **kwargs: Any) -> None:
+    def __init__(self, prepend_metadata: bool = False, **kwargs: Any) -> None:
         """Create a sentence chunker instance."""
         self._prepend_metadata = prepend_metadata
         bootstrap()
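
With the new default, a SentenceChunker can now be constructed with no arguments. A small usage sketch; the expected output is inferred from the sentence tests further down, so treat it as indicative rather than exact:

# Usage sketch: prepend_metadata now defaults to False, so no arguments are
# required. Expected chunks are inferred from tests/unit/chunking/test_chunker.py.
from graphrag.chunking.sentence_chunker import SentenceChunker

chunker = SentenceChunker()
print(chunker.chunk("This is a test. Another sentence."))
# expected: ['This is a test.', 'Another sentence.']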

packages/graphrag/graphrag/chunking/token_chunker.py

Lines changed: 2 additions & 2 deletions

@@ -24,9 +24,9 @@ def __init__(
         size: int,
         overlap: int,
         encoding_model: str,
-        prepend_metadata: bool,
-        chunk_size_includes_metadata: bool,
         tokenizer: Tokenizer,
+        prepend_metadata: bool = False,
+        chunk_size_includes_metadata: bool = False,
         **kwargs: Any,
     ) -> None:
         """Create a token chunker instance."""

packages/graphrag/graphrag/config/defaults.py

Lines changed: 1 addition & 1 deletion

@@ -60,9 +60,9 @@ class BasicSearchDefaults:
 class ChunksDefaults:
     """Default values for chunks."""

+    strategy: str = ChunkStrategyType.Tokens
     size: int = 1200
     overlap: int = 100
-    strategy: ClassVar[ChunkStrategyType] = ChunkStrategyType.tokens
     encoding_model: str = ENCODING_MODEL
     prepend_metadata: bool = False
     chunk_size_includes_metadata: bool = False

packages/graphrag/graphrag/config/init_content.py

Lines changed: 2 additions & 0 deletions

@@ -55,8 +55,10 @@
   file_type: {graphrag_config_defaults.input.file_type.value} # [csv, text, json]

 chunks:
+  strategy: {graphrag_config_defaults.chunks.strategy}
   size: {graphrag_config_defaults.chunks.size}
   overlap: {graphrag_config_defaults.chunks.overlap}
+  encoding_model: {graphrag_config_defaults.chunks.encoding_model}

 ### Output/storage settings ###
 ## If blob storage is specified in the following four sections,
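
With the defaults in defaults.py above, the two new template lines render the tokens strategy and the package encoding model into the generated settings. A rough sketch of the interpolation, abbreviated to just the chunks block (the real init content contains many more sections):

# Abbreviated sketch of how the init template interpolates the chunk defaults.
from graphrag.config.defaults import graphrag_config_defaults

chunks_block = f"""\
chunks:
  strategy: {graphrag_config_defaults.chunks.strategy}
  size: {graphrag_config_defaults.chunks.size}
  overlap: {graphrag_config_defaults.chunks.overlap}
  encoding_model: {graphrag_config_defaults.chunks.encoding_model}
"""

print(chunks_block)
# Roughly:
# chunks:
#   strategy: tokens
#   size: 1200
#   overlap: 100
#   encoding_model: <whatever ENCODING_MODEL is set to>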

packages/graphrag/graphrag/index/workflows/create_base_text_units.py

Lines changed: 0 additions & 1 deletion

@@ -92,4 +92,3 @@ def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
     return cast(
         "pd.DataFrame", text_units[text_units["text"].notna()].reset_index(drop=True)
     )
-

tests/unit/chunking/test_chunker.py

Lines changed: 4 additions & 4 deletions

@@ -31,7 +31,7 @@ def setup_method(self, method):
     def test_basic_functionality(self):
         """Test basic sentence splitting without metadata"""
         input = "This is a test. Another sentence."
-        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence))
+        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = chunker.chunk(input)

         assert len(chunks) == 2

@@ -41,14 +41,14 @@ def test_basic_functionality(self):
     def test_multiple_documents(self):
         """Test processing multiple input documents"""
         input = ["First. Document.", "Second. Doc."]
-        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence))
+        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = [chunk for doc in input for chunk in chunker.chunk(doc)]
         assert len(chunks) == 4

     def test_mixed_whitespace_handling(self):
         """Test input with irregular whitespace"""
         input = " Sentence with spaces. Another one! "
-        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence))
+        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = chunker.chunk(input)
         assert chunks[0] == " Sentence with spaces."
         assert chunks[1] == "Another one!"

@@ -67,7 +67,7 @@ def test_basic_functionality(self, mock_get_encoding):
             size=5,
             overlap=1,
             encoding_model="fake-encoding",
-            strategy=ChunkStrategyType.tokens,
+            strategy=ChunkStrategyType.Tokens,
         )

         chunker = create_chunker(config, tokenizer=tokenizer)
