graphrag
diff --git a/.semversioner/next-release/patch-20241224192900934104.json b/.semversioner/next-release/patch-20241224192900934104.json
Lines changed: 4 additions & 0 deletions
diff --git a/graphrag/config/create_graphrag_config.py b/graphrag/config/create_graphrag_config.py
Lines changed: 14 additions & 3 deletions
diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py
Lines changed: 3 additions & 0 deletions
diff --git a/graphrag/config/init_content.py b/graphrag/config/init_content.py
Lines changed: 2 additions & 2 deletions
diff --git a/graphrag/config/models/chunking_config.py b/graphrag/config/models/chunking_config.py
Lines changed: 17 additions & 17 deletions
diff --git a/graphrag/config/models/cluster_graph_config.py b/graphrag/config/models/cluster_graph_config.py
Lines changed: 7 additions & 11 deletions
diff --git a/graphrag/config/models/embed_graph_config.py b/graphrag/config/models/embed_graph_config.py
Lines changed: 6 additions & 17 deletions
diff --git a/graphrag/config/models/entity_extraction_config.py b/graphrag/config/models/entity_extraction_config.py
Lines changed: 0 additions & 2 deletions
diff --git a/graphrag/index/create_pipeline_config.py b/graphrag/index/create_pipeline_config.py
Lines changed: 4 additions & 12 deletions
diff --git a/graphrag/index/flows/compute_communities.py b/graphrag/index/flows/compute_communities.py
Lines changed: 6 additions & 4 deletions
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "Simplify and streamline internal config."
+}
@@ -31,7 +31,7 @@
 from graphrag.config.input_models.graphrag_config_input import GraphRagConfigInput
 from graphrag.config.input_models.llm_config_input import LLMConfigInput
 from graphrag.config.models.cache_config import CacheConfig
-from graphrag.config.models.chunking_config import ChunkingConfig
+from graphrag.config.models.chunking_config import ChunkingConfig, ChunkStrategyType
 from graphrag.config.models.claim_extraction_config import ClaimExtractionConfig
 from graphrag.config.models.cluster_graph_config import ClusterGraphConfig
 from graphrag.config.models.community_reports_config import CommunityReportsConfig
@@ -318,13 +318,16 @@ def hydrate_parallelization_params(
             reader.envvar_prefix(Section.node2vec),
             reader.use(values.get("embed_graph")),
         ):
+            use_lcc = reader.bool("use_lcc")
             embed_graph_model = EmbedGraphConfig(
                 enabled=reader.bool(Fragment.enabled) or defs.NODE2VEC_ENABLED,
+                dimensions=reader.int("dimensions") or defs.NODE2VEC_DIMENSIONS,
                 num_walks=reader.int("num_walks") or defs.NODE2VEC_NUM_WALKS,
                 walk_length=reader.int("walk_length") or defs.NODE2VEC_WALK_LENGTH,
                 window_size=reader.int("window_size") or defs.NODE2VEC_WINDOW_SIZE,
                 iterations=reader.int("iterations") or defs.NODE2VEC_ITERATIONS,
                 random_seed=reader.int("random_seed") or defs.NODE2VEC_RANDOM_SEED,
+                use_lcc=use_lcc if use_lcc is not None else defs.USE_LCC,
             )
         with reader.envvar_prefix(Section.input), reader.use(values.get("input")):
             input_type = reader.str("type")
@@ -412,12 +415,15 @@ def hydrate_parallelization_params(
             encoding_model = (
                 reader.str(Fragment.encoding_model) or global_encoding_model
             )
-
+            strategy = reader.str("strategy")
             chunks_model = ChunkingConfig(
                 size=reader.int("size") or defs.CHUNK_SIZE,
                 overlap=reader.int("overlap") or defs.CHUNK_OVERLAP,
                 group_by_columns=group_by_columns,
                 encoding_model=encoding_model,
+                strategy=ChunkStrategyType(strategy)
+                if strategy
+                else ChunkStrategyType.tokens,
             )
         with (
             reader.envvar_prefix(Section.snapshot),
@@ -522,8 +528,13 @@ def hydrate_parallelization_params(
             )
 
         with reader.use(values.get("cluster_graph")):
+            use_lcc = reader.bool("use_lcc")
+            seed = reader.int("seed")
             cluster_graph_model = ClusterGraphConfig(
-                max_cluster_size=reader.int("max_cluster_size") or defs.MAX_CLUSTER_SIZE
+                max_cluster_size=reader.int("max_cluster_size")
+                or defs.MAX_CLUSTER_SIZE,
+                use_lcc=use_lcc if use_lcc is not None else defs.USE_LCC,
+                seed=seed if seed is not None else defs.CLUSTER_GRAPH_SEED,
             )
 
         with (
 
@@ -60,6 +60,8 @@
 CLAIM_MAX_GLEANINGS = 1
 CLAIM_EXTRACTION_ENABLED = False
 MAX_CLUSTER_SIZE = 10
+USE_LCC = True
+CLUSTER_GRAPH_SEED = 0xDEADBEEF
 COMMUNITY_REPORT_MAX_LENGTH = 2000
 COMMUNITY_REPORT_MAX_INPUT_LENGTH = 8000
 ENTITY_EXTRACTION_ENTITY_TYPES = ["organization", "person", "geo", "event"]
@@ -74,6 +76,7 @@
 PARALLELIZATION_STAGGER = 0.3
 PARALLELIZATION_NUM_THREADS = 50
 NODE2VEC_ENABLED = False
+NODE2VEC_DIMENSIONS = 1536
 NODE2VEC_NUM_WALKS = 10
 NODE2VEC_WALK_LENGTH = 40
 NODE2VEC_WINDOW_SIZE = 2
 
@@ -12,7 +12,7 @@
 ### LLM settings ###
 ## There are a number of settings to tune the threading and token limits for LLM calls - check the docs.
 
-encoding_model: cl100k_base # this needs to be matched to your model!
+encoding_model: {defs.ENCODING_MODEL} # this needs to be matched to your model!
 
 llm:
   api_key: ${{GRAPHRAG_API_KEY}} # set this in the generated .env file
@@ -111,7 +111,7 @@
   enabled: false # if true, will generate node2vec embeddings for nodes
 
 umap:
-  enabled: false # if true, will generate UMAP embeddings for nodes
+  enabled: false # if true, will generate UMAP embeddings for nodes (embed_graph must also be enabled)
 
 snapshots:
   graphml: false
 
@@ -3,11 +3,24 @@
 
 """Parameterization settings for the default configuration."""
 
+from enum import Enum
+
 from pydantic import BaseModel, Field
 
 import graphrag.config.defaults as defs
 
 
+class ChunkStrategyType(str, Enum):
+    """ChunkStrategy class definition."""
+
+    tokens = "tokens"
+    sentence = "sentence"
+
+    def __repr__(self):
+        """Get a string representation."""
+        return f'"{self.value}"'
+
+
 class ChunkingConfig(BaseModel):
     """Configuration section for chunking."""
 
@@ -19,22 +32,9 @@ class ChunkingConfig(BaseModel):
         description="The chunk by columns to use.",
         default=defs.CHUNK_GROUP_BY_COLUMNS,
     )
-    strategy: dict | None = Field(
-        description="The chunk strategy to use, overriding the default tokenization strategy",
-        default=None,
+    strategy: ChunkStrategyType = Field(
+        description="The chunking strategy to use.", default=ChunkStrategyType.tokens
     )
-    encoding_model: str | None = Field(
-        default=None, description="The encoding model to use."
+    encoding_model: str = Field(
+        description="The encoding model to use.", default=defs.ENCODING_MODEL
     )
-
-    def resolved_strategy(self, encoding_model: str | None) -> dict:
-        """Get the resolved chunking strategy."""
-        from graphrag.index.operations.chunk_text import ChunkStrategyType
-
-        return self.strategy or {
-            "type": ChunkStrategyType.tokens,
-            "chunk_size": self.size,
-            "chunk_overlap": self.overlap,
-            "group_by_columns": self.group_by_columns,
-            "encoding_name": encoding_model or self.encoding_model,
-        }
@@ -14,15 +14,11 @@ class ClusterGraphConfig(BaseModel):
     max_cluster_size: int = Field(
         description="The maximum cluster size to use.", default=defs.MAX_CLUSTER_SIZE
     )
-    strategy: dict | None = Field(
-        description="The cluster strategy to use.", default=None
+    use_lcc: bool = Field(
+        description="Whether to use the largest connected component.",
+        default=defs.USE_LCC,
+    )
+    seed: int | None = Field(
+        description="The seed to use for the clustering.",
+        default=defs.CLUSTER_GRAPH_SEED,
     )
-
-    def resolved_strategy(self) -> dict:
-        """Get the resolved cluster strategy."""
-        from graphrag.index.operations.cluster_graph import GraphCommunityStrategyType
-
-        return self.strategy or {
-            "type": GraphCommunityStrategyType.leiden,
-            "max_cluster_size": self.max_cluster_size,
-        }
@@ -15,6 +15,9 @@ class EmbedGraphConfig(BaseModel):
         description="A flag indicating whether to enable node2vec.",
         default=defs.NODE2VEC_ENABLED,
     )
+    dimensions: int = Field(
+        description="The node2vec vector dimensions.", default=defs.NODE2VEC_DIMENSIONS
+    )
     num_walks: int = Field(
         description="The node2vec number of walks.", default=defs.NODE2VEC_NUM_WALKS
     )
@@ -30,21 +33,7 @@ class EmbedGraphConfig(BaseModel):
     random_seed: int = Field(
         description="The node2vec random seed.", default=defs.NODE2VEC_RANDOM_SEED
     )
-    strategy: dict | None = Field(
-        description="The graph embedding strategy override.", default=None
+    use_lcc: bool = Field(
+        description="Whether to use the largest connected component.",
+        default=defs.USE_LCC,
     )
-
-    def resolved_strategy(self) -> dict:
-        """Get the resolved node2vec strategy."""
-        from graphrag.index.operations.embed_graph.typing import (
-            EmbedGraphStrategyType,
-        )
-
-        return self.strategy or {
-            "type": EmbedGraphStrategyType.node2vec,
-            "num_walks": self.num_walks,
-            "walk_length": self.walk_length,
-            "window_size": self.window_size,
-            "iterations": self.iterations,
-            "random_seed": self.iterations,
-        }
@@ -48,7 +48,5 @@ def resolved_strategy(self, root_dir: str, encoding_model: str | None) -> dict:
             if self.prompt
             else None,
             "max_gleanings": self.max_gleanings,
-            # It's prechunked in create_base_text_units
             "encoding_name": encoding_model or self.encoding_model,
-            "prechunked": True,
         }
@@ -176,13 +176,8 @@ def _text_unit_workflows(
         PipelineWorkflowReference(
             name=create_base_text_units,
             config={
+                "chunks": settings.chunks,
                 "snapshot_transient": settings.snapshots.transient,
-                "chunk_by": settings.chunks.group_by_columns,
-                "text_chunk": {
-                    "strategy": settings.chunks.resolved_strategy(
-                        settings.encoding_model
-                    )
-                },
             },
         ),
         PipelineWorkflowReference(
@@ -243,9 +238,7 @@ def _graph_workflows(settings: GraphRagConfig) -> list[PipelineWorkflowReference
         PipelineWorkflowReference(
             name=compute_communities,
             config={
-                "cluster_graph": {
-                    "strategy": settings.cluster_graph.resolved_strategy()
-                },
+                "cluster_graph": settings.cluster_graph,
                 "snapshot_transient": settings.snapshots.transient,
             },
         ),
@@ -260,9 +253,8 @@ def _graph_workflows(settings: GraphRagConfig) -> list[PipelineWorkflowReference
         PipelineWorkflowReference(
             name=create_final_nodes,
             config={
-                "layout_graph_enabled": settings.umap.enabled,
-                "embed_graph_enabled": settings.embed_graph.enabled,
-                "embed_graph": {"strategy": settings.embed_graph.resolved_strategy()},
+                "layout_enabled": settings.umap.enabled,
+                "embed_graph": settings.embed_graph,
             },
         ),
     ]
 
@@ -3,8 +3,6 @@
 
 """All the steps to create the base entity graph."""
 
-from typing import Any
-
 import pandas as pd
 
 from graphrag.index.operations.cluster_graph import cluster_graph
@@ -13,14 +11,18 @@
 
 def compute_communities(
     base_relationship_edges: pd.DataFrame,
-    clustering_strategy: dict[str, Any],
+    max_cluster_size: int,
+    use_lcc: bool,
+    seed: int | None = None,
 ) -> pd.DataFrame:
     """All the steps to create the base entity graph."""
     graph = create_graph(base_relationship_edges)
 
     communities = cluster_graph(
         graph,
-        strategy=clustering_strategy,
+        max_cluster_size,
+        use_lcc,
+        seed=seed,
     )
 
     base_communities = pd.DataFrame(
-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +{
 +  "type": "patch",
 +  "description": "Simplify and streamline internal config."
 +}
Original file line number	Diff line number	Diff line change
`@@ -48,7 +48,5 @@ def resolved_strategy(self, root_dir: str, encoding_model: str \| None) -> dict:`
`48`	`48`	`if self.prompt`
`49`	`49`	`else None,`
`50`	`50`	`"max_gleanings": self.max_gleanings,`
`51`		`- # It's prechunked in create_base_text_units`
`52`	`51`	`"encoding_name": encoding_model or self.encoding_model,`
`53`		`- "prechunked": True,`
`54`	`52`	`}`