Skip to content

Commit ebe959a

Browse files
authored
Update v3/main missing config + functions (#2082)
1 parent d7773bd commit ebe959a

File tree

7 files changed

+1910
-1874
lines changed

7 files changed

+1910
-1874
lines changed

graphrag/config/defaults.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from collections.abc import Callable
77
from dataclasses import dataclass, field
88
from pathlib import Path
9-
from typing import ClassVar, Literal
9+
from typing import ClassVar
1010

1111
from graphrag.config.embeddings import default_embeddings
1212
from graphrag.config.enums import (
@@ -311,6 +311,8 @@ class LanguageModelDefaults:
311311
model_supports_json: None = None
312312
rate_limit_strategy: str | None = "static"
313313
retry_strategy: str = "native"
314+
tokens_per_minute: None = None
315+
requests_per_minute: None = None
314316
max_retries: int = 10
315317
max_retry_wait: float = 10.0
316318
concurrent_requests: int = 25

graphrag/index/operations/embed_text/embed_text.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ async def _text_embed_in_memory(
8989
strategy_exec = load_strategy(strategy_type)
9090
strategy_config = {**strategy}
9191

92-
texts: list[str] = input[embed_column].to_numpy().tolist()
92+
texts: list[str] = input[embed_column].tolist()
9393
result = await strategy_exec(texts, callbacks, cache, strategy_config)
9494

9595
return result.embeddings
@@ -151,9 +151,9 @@ async def _text_embed_with_vector_store(
151151
insert_batch_size,
152152
)
153153
batch = input.iloc[insert_batch_size * i : insert_batch_size * (i + 1)]
154-
texts: list[str] = batch[embed_column].to_numpy().tolist()
155-
titles: list[str] = batch[title].to_numpy().tolist()
156-
ids: list[str] = batch[id_column].to_numpy().tolist()
154+
texts: list[str] = batch[embed_column].tolist()
155+
titles: list[str] = batch[title].tolist()
156+
ids: list[str] = batch[id_column].tolist()
157157
result = await strategy_exec(texts, callbacks, cache, strategy_config)
158158
if result.embeddings:
159159
embeddings = [

graphrag/index/utils/graphs.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def hierarchical_leiden(
5858
graph: nx.Graph,
5959
max_cluster_size: int = 10,
6060
random_seed: int | None = 0xDEADBEEF,
61-
) -> Any:
61+
) -> list[gn.HierarchicalCluster]:
6262
"""Run hierarchical leiden on the graph."""
6363
return gn.hierarchical_leiden(
6464
edges=_nx_to_edge_list(graph),
@@ -140,7 +140,7 @@ def calculate_root_modularity(
140140
hcs = hierarchical_leiden(
141141
graph, max_cluster_size=max_cluster_size, random_seed=random_seed
142142
)
143-
root_clusters = hcs.first_level_hierarchical_clustering()
143+
root_clusters = first_level_hierarchical_clustering(hcs)
144144
return modularity(graph, root_clusters)
145145

146146

@@ -153,7 +153,7 @@ def calculate_leaf_modularity(
153153
hcs = hierarchical_leiden(
154154
graph, max_cluster_size=max_cluster_size, random_seed=random_seed
155155
)
156-
leaf_clusters = hcs.final_level_hierarchical_clustering()
156+
leaf_clusters = final_level_hierarchical_clustering(hcs)
157157
return modularity(graph, leaf_clusters)
158158

159159

@@ -351,3 +351,32 @@ def get_upper_threshold_by_std(data: list[float] | list[int], std_trim: float) -
351351
mean = np.mean(data)
352352
std = np.std(data)
353353
return cast("float", mean + std_trim * std)
354+
355+
356+
def first_level_hierarchical_clustering(
    hcs: list[gn.HierarchicalCluster],
) -> dict[Any, int]:
    """Return the first-level (level 0) clustering as a node-to-community map.

    Parameters
    ----------
    hcs : list[gn.HierarchicalCluster]
        Hierarchical leiden results, e.g. the output of
        ``hierarchical_leiden``. Only the ``node``, ``cluster`` and
        ``level`` attributes of each entry are read.

    Returns
    -------
    dict[Any, int]
        The initial leiden algorithm clustering results as a dictionary
        of node id to community id. Empty when no entry has ``level == 0``.
    """
    # Keep only root-level entries; if a node shows up more than once at
    # level 0, the last matching entry wins (plain dict semantics).
    return {entry.node: entry.cluster for entry in hcs if entry.level == 0}
368+
369+
370+
def final_level_hierarchical_clustering(
    hcs: list[gn.HierarchicalCluster],
) -> dict[Any, int]:
    """Return the final (leaf) clustering as a node-to-community map.

    Parameters
    ----------
    hcs : list[gn.HierarchicalCluster]
        Hierarchical leiden results, e.g. the output of
        ``hierarchical_leiden``. Only the ``node``, ``cluster`` and
        ``is_final_cluster`` attributes of each entry are read.

    Returns
    -------
    dict[Any, int]
        The last leiden algorithm clustering results as a dictionary
        of node id to community id. Empty when no entry is flagged
        ``is_final_cluster``.
    """
    # Keep only entries flagged as final; if a node has several such
    # entries, the last one wins (plain dict semantics).
    return {entry.node: entry.cluster for entry in hcs if entry.is_final_cluster}

graphrag/logger/progress.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,7 @@ def __call__(self, num_ticks: int = 1) -> None:
5656
description=self._description,
5757
)
5858
if p.description:
59-
logger.info(
60-
"%s%s/%s", p.description, str(p.completed_items), str(p.total_items)
61-
)
59+
logger.info("%s%s/%s", p.description, p.completed_items, p.total_items)
6260
self._callback(p)
6361

6462
def done(self) -> None:

graphrag/prompt_tune/loader/input.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""Input loading module."""
55

66
import logging
7+
from typing import Any
78

89
import numpy as np
910
import pandas as pd
@@ -27,7 +28,7 @@
2728

2829
def _sample_chunks_from_embeddings(
2930
text_chunks: pd.DataFrame,
30-
embeddings: np.ndarray[float, np.dtype[np.float_]],
31+
embeddings: np.ndarray[Any, np.dtype[np.float64]],
3132
k: int = K,
3233
) -> pd.DataFrame:
3334
"""Sample text chunks from embeddings."""

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ ignore = [
239239
"PERF203", # Needs restructuring of errors, we should bail-out on first error
240240
"C901", # needs refactoring to remove cyclomatic complexity
241241
"B008", # Needs to restructure our cli params with Typer into constants
242+
"ASYNC240",
242243
]
243244

244245
[tool.ruff.lint.per-file-ignores]

0 commit comments

Comments
 (0)