Update v3/main missing config + functions (#2082)

andresmor-ms · web-flow · commit ebe959a9db75 · 2025-09-30T15:48:40.000-06:00
diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py
@@ -6,7 +6,7 @@
 from collections.abc import Callable
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import ClassVar, Literal
+from typing import ClassVar
 
 from graphrag.config.embeddings import default_embeddings
 from graphrag.config.enums import (
@@ -311,6 +311,8 @@ class LanguageModelDefaults:
     model_supports_json: None = None
     rate_limit_strategy: str | None = "static"
     retry_strategy: str = "native"
+    tokens_per_minute: None = None
+    requests_per_minute: None = None
     max_retries: int = 10
     max_retry_wait: float = 10.0
     concurrent_requests: int = 25
diff --git a/graphrag/index/operations/embed_text/embed_text.py b/graphrag/index/operations/embed_text/embed_text.py
@@ -89,7 +89,7 @@ async def _text_embed_in_memory(
     strategy_exec = load_strategy(strategy_type)
     strategy_config = {**strategy}
 
-    texts: list[str] = input[embed_column].to_numpy().tolist()
+    texts: list[str] = input[embed_column].tolist()
     result = await strategy_exec(texts, callbacks, cache, strategy_config)
 
     return result.embeddings
@@ -151,9 +151,9 @@ async def _text_embed_with_vector_store(
             insert_batch_size,
         )
         batch = input.iloc[insert_batch_size * i : insert_batch_size * (i + 1)]
-        texts: list[str] = batch[embed_column].to_numpy().tolist()
-        titles: list[str] = batch[title].to_numpy().tolist()
-        ids: list[str] = batch[id_column].to_numpy().tolist()
+        texts: list[str] = batch[embed_column].tolist()
+        titles: list[str] = batch[title].tolist()
+        ids: list[str] = batch[id_column].tolist()
         result = await strategy_exec(texts, callbacks, cache, strategy_config)
         if result.embeddings:
             embeddings = [
diff --git a/graphrag/index/utils/graphs.py b/graphrag/index/utils/graphs.py
@@ -58,7 +58,7 @@ def hierarchical_leiden(
     graph: nx.Graph,
     max_cluster_size: int = 10,
     random_seed: int | None = 0xDEADBEEF,
-) -> Any:
+) -> list[gn.HierarchicalCluster]:
     """Run hierarchical leiden on the graph."""
     return gn.hierarchical_leiden(
         edges=_nx_to_edge_list(graph),
@@ -140,7 +140,7 @@ def calculate_root_modularity(
     hcs = hierarchical_leiden(
         graph, max_cluster_size=max_cluster_size, random_seed=random_seed
     )
-    root_clusters = hcs.first_level_hierarchical_clustering()
+    root_clusters = first_level_hierarchical_clustering(hcs)
     return modularity(graph, root_clusters)
 
 
@@ -153,7 +153,7 @@ def calculate_leaf_modularity(
     hcs = hierarchical_leiden(
         graph, max_cluster_size=max_cluster_size, random_seed=random_seed
     )
-    leaf_clusters = hcs.final_level_hierarchical_clustering()
+    leaf_clusters = final_level_hierarchical_clustering(hcs)
     return modularity(graph, leaf_clusters)
 
 
@@ -351,3 +351,32 @@ def get_upper_threshold_by_std(data: list[float] | list[int], std_trim: float) -
     mean = np.mean(data)
     std = np.std(data)
     return cast("float", mean + std_trim * std)
+
+
+def first_level_hierarchical_clustering(
+    hcs: list[gn.HierarchicalCluster],
+) -> dict[Any, int]:
+    """Map each node to its cluster among the level-0 entries of ``hcs``.
+
+    Returns
+    -------
+    dict[Any, int]
+        ``entry.node`` -> ``entry.cluster`` for every entry whose ``level``
+        is 0; if a node repeats, the last matching entry wins.
+    """
+    return {entry.node: entry.cluster for entry in hcs if entry.level == 0}
+
+
+def final_level_hierarchical_clustering(
+    hcs: list[gn.HierarchicalCluster],
+) -> dict[Any, int]:
+    """
+    Map each node to its cluster among the final-cluster entries of ``hcs``.
+
+    Returns
+    -------
+    dict[Any, int]
+        ``entry.node`` -> ``entry.cluster`` for every entry whose
+        ``is_final_cluster`` is truthy; a repeated node keeps the last match.
+    """
+    return {entry.node: entry.cluster for entry in hcs if entry.is_final_cluster}
diff --git a/graphrag/logger/progress.py b/graphrag/logger/progress.py
@@ -56,9 +56,7 @@ def __call__(self, num_ticks: int = 1) -> None:
                 description=self._description,
             )
             if p.description:
-                logger.info(
-                    "%s%s/%s", p.description, str(p.completed_items), str(p.total_items)
-                )
+                logger.info("%s%s/%s", p.description, p.completed_items, p.total_items)
             self._callback(p)
 
     def done(self) -> None:
diff --git a/graphrag/prompt_tune/loader/input.py b/graphrag/prompt_tune/loader/input.py
@@ -4,6 +4,7 @@
 """Input loading module."""
 
 import logging
+from typing import Any
 
 import numpy as np
 import pandas as pd
@@ -27,7 +28,7 @@
 
 def _sample_chunks_from_embeddings(
     text_chunks: pd.DataFrame,
-    embeddings: np.ndarray[float, np.dtype[np.float_]],
+    embeddings: np.ndarray[Any, np.dtype[np.float64]],
     k: int = K,
 ) -> pd.DataFrame:
     """Sample text chunks from embeddings."""
diff --git a/pyproject.toml b/pyproject.toml
@@ -239,6 +239,7 @@ ignore = [
     "PERF203", # Needs restructuring of errors, we should bail-out on first error
     "C901",    # needs refactoring to remove cyclomatic complexity
     "B008", # Needs to restructure our cli params with Typer into constants
+    "ASYNC240", 
 ]
 
 [tool.ruff.lint.per-file-ignores]
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -239,6 +239,7 @@ ignore = [`
`239`	`239`	`"PERF203", # Needs restructuring of errors, we should bail-out on first error`
`240`	`240`	`"C901", # needs refactoring to remove cyclomatic complexity`
`241`	`241`	`"B008", # Needs to restructure our cli params with Typer into constants`
	`242`	`+ "ASYNC240",`
`242`	`243`	`]`
`243`	`244`
`244`	`245`	`[tool.ruff.lint.per-file-ignores]`