Skip to content

Commit 76d6f50

Browse files
committed
Push tokenizer throughout
1 parent 2cb6dc3 commit 76d6f50

File tree

16 files changed

+105
-109
lines changed

16 files changed

+105
-109
lines changed

graphrag/index/operations/summarize_communities/build_mixed_context.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@
88
from graphrag.index.operations.summarize_communities.graph_context.sort_context import (
99
sort_context,
1010
)
11-
from graphrag.query.llm.text_utils import num_tokens
11+
from graphrag.tokenizer.tokenizer import Tokenizer
1212

1313

14-
def build_mixed_context(context: list[dict], max_context_tokens: int) -> str:
14+
def build_mixed_context(
15+
context: list[dict], tokenizer: Tokenizer, max_context_tokens: int
16+
) -> str:
1517
"""
1618
Build parent context by concatenating all sub-communities' contexts.
1719
@@ -45,9 +47,10 @@ def build_mixed_context(context: list[dict], max_context_tokens: int) -> str:
4547
remaining_local_context.extend(sorted_context[rid][schemas.ALL_CONTEXT])
4648
new_context_string = sort_context(
4749
local_context=remaining_local_context + final_local_contexts,
50+
tokenizer=tokenizer,
4851
sub_community_reports=substitute_reports,
4952
)
50-
if num_tokens(new_context_string) <= max_context_tokens:
53+
if tokenizer.num_tokens(new_context_string) <= max_context_tokens:
5154
exceeded_limit = False
5255
context_string = new_context_string
5356
break
@@ -63,7 +66,7 @@ def build_mixed_context(context: list[dict], max_context_tokens: int) -> str:
6366
new_context_string = pd.DataFrame(substitute_reports).to_csv(
6467
index=False, sep=","
6568
)
66-
if num_tokens(new_context_string) > max_context_tokens:
69+
if tokenizer.num_tokens(new_context_string) > max_context_tokens:
6770
break
6871

6972
context_string = new_context_string

graphrag/index/operations/summarize_communities/graph_context/context_builder.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
where_column_equals,
3131
)
3232
from graphrag.logger.progress import progress_iterable
33-
from graphrag.query.llm.text_utils import num_tokens
33+
from graphrag.tokenizer.tokenizer import Tokenizer
3434

3535
logger = logging.getLogger(__name__)
3636

@@ -39,6 +39,7 @@ def build_local_context(
3939
nodes,
4040
edges,
4141
claims,
42+
tokenizer: Tokenizer,
4243
callbacks: WorkflowCallbacks,
4344
max_context_tokens: int = 16_000,
4445
):
@@ -49,7 +50,7 @@ def build_local_context(
4950

5051
for level in progress_iterable(levels, callbacks.progress, len(levels)):
5152
communities_at_level_df = _prepare_reports_at_level(
52-
nodes, edges, claims, level, max_context_tokens
53+
nodes, edges, claims, tokenizer, level, max_context_tokens
5354
)
5455

5556
communities_at_level_df.loc[:, schemas.COMMUNITY_LEVEL] = level
@@ -63,6 +64,7 @@ def _prepare_reports_at_level(
6364
node_df: pd.DataFrame,
6465
edge_df: pd.DataFrame,
6566
claim_df: pd.DataFrame | None,
67+
tokenizer: Tokenizer,
6668
level: int,
6769
max_context_tokens: int = 16_000,
6870
) -> pd.DataFrame:
@@ -181,6 +183,7 @@ def _prepare_reports_at_level(
181183
# Generate community-level context strings using vectorized batch processing
182184
return parallel_sort_context_batch(
183185
community_df,
186+
tokenizer=tokenizer,
184187
max_context_tokens=max_context_tokens,
185188
)
186189

@@ -189,6 +192,7 @@ def build_level_context(
189192
report_df: pd.DataFrame | None,
190193
community_hierarchy_df: pd.DataFrame,
191194
local_context_df: pd.DataFrame,
195+
tokenizer: Tokenizer,
192196
level: int,
193197
max_context_tokens: int,
194198
) -> pd.DataFrame:
@@ -219,11 +223,11 @@ def build_level_context(
219223

220224
if report_df is None or report_df.empty:
221225
invalid_context_df.loc[:, schemas.CONTEXT_STRING] = _sort_and_trim_context(
222-
invalid_context_df, max_context_tokens
226+
invalid_context_df, tokenizer, max_context_tokens
223227
)
224228
invalid_context_df[schemas.CONTEXT_SIZE] = invalid_context_df.loc[
225229
:, schemas.CONTEXT_STRING
226-
].map(num_tokens)
230+
].map(tokenizer.num_tokens)
227231
invalid_context_df[schemas.CONTEXT_EXCEED_FLAG] = False
228232
return union(valid_context_df, invalid_context_df)
229233

@@ -237,18 +241,21 @@ def build_level_context(
237241
invalid_context_df,
238242
sub_context_df,
239243
community_hierarchy_df,
244+
tokenizer,
240245
max_context_tokens,
241246
)
242247

243248
# handle any remaining invalid records that can't be substituted with sub-community reports
244249
# this should be rare, but if it happens, we will just trim the local context to fit the limit
245250
remaining_df = _antijoin_reports(invalid_context_df, community_df)
246251
remaining_df.loc[:, schemas.CONTEXT_STRING] = _sort_and_trim_context(
247-
remaining_df, max_context_tokens
252+
remaining_df, tokenizer, max_context_tokens
248253
)
249254

250255
result = union(valid_context_df, community_df, remaining_df)
251-
result[schemas.CONTEXT_SIZE] = result.loc[:, schemas.CONTEXT_STRING].map(num_tokens)
256+
result[schemas.CONTEXT_SIZE] = result.loc[:, schemas.CONTEXT_STRING].map(
257+
tokenizer.num_tokens
258+
)
252259

253260
result[schemas.CONTEXT_EXCEED_FLAG] = False
254261
return result
@@ -269,19 +276,29 @@ def _antijoin_reports(df: pd.DataFrame, reports: pd.DataFrame) -> pd.DataFrame:
269276
return antijoin(df, reports, schemas.COMMUNITY_ID)
270277

271278

272-
def _sort_and_trim_context(df: pd.DataFrame, max_context_tokens: int) -> pd.Series:
279+
def _sort_and_trim_context(
280+
df: pd.DataFrame, tokenizer: Tokenizer, max_context_tokens: int
281+
) -> pd.Series:
273282
"""Sort and trim context to fit the limit."""
274283
series = cast("pd.Series", df[schemas.ALL_CONTEXT])
275284
return transform_series(
276-
series, lambda x: sort_context(x, max_context_tokens=max_context_tokens)
285+
series,
286+
lambda x: sort_context(
287+
x, tokenizer=tokenizer, max_context_tokens=max_context_tokens
288+
),
277289
)
278290

279291

280-
def _build_mixed_context(df: pd.DataFrame, max_context_tokens: int) -> pd.Series:
292+
def _build_mixed_context(
293+
df: pd.DataFrame, tokenizer: Tokenizer, max_context_tokens: int
294+
) -> pd.Series:
281295
"""Sort and trim context to fit the limit."""
282296
series = cast("pd.Series", df[schemas.ALL_CONTEXT])
283297
return transform_series(
284-
series, lambda x: build_mixed_context(x, max_context_tokens=max_context_tokens)
298+
series,
299+
lambda x: build_mixed_context(
300+
x, tokenizer, max_context_tokens=max_context_tokens
301+
),
285302
)
286303

287304

@@ -303,6 +320,7 @@ def _get_community_df(
303320
invalid_context_df: pd.DataFrame,
304321
sub_context_df: pd.DataFrame,
305322
community_hierarchy_df: pd.DataFrame,
323+
tokenizer: Tokenizer,
306324
max_context_tokens: int,
307325
) -> pd.DataFrame:
308326
"""Get community context for each community."""
@@ -338,7 +356,7 @@ def _get_community_df(
338356
.reset_index()
339357
)
340358
community_df[schemas.CONTEXT_STRING] = _build_mixed_context(
341-
community_df, max_context_tokens
359+
community_df, tokenizer, max_context_tokens
342360
)
343361
community_df[schemas.COMMUNITY_LEVEL] = level
344362
return community_df

graphrag/index/operations/summarize_communities/graph_context/sort_context.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,12 @@
55
import pandas as pd
66

77
import graphrag.data_model.schemas as schemas
8-
from graphrag.query.llm.text_utils import num_tokens
8+
from graphrag.tokenizer.tokenizer import Tokenizer
99

1010

1111
def sort_context(
1212
local_context: list[dict],
13+
tokenizer: Tokenizer,
1314
sub_community_reports: list[dict] | None = None,
1415
max_context_tokens: int | None = None,
1516
node_name_column: str = schemas.TITLE,
@@ -112,7 +113,10 @@ def _get_context_string(
112113
new_context_string = _get_context_string(
113114
sorted_nodes, sorted_edges, sorted_claims, sub_community_reports
114115
)
115-
if max_context_tokens and num_tokens(new_context_string) > max_context_tokens:
116+
if (
117+
max_context_tokens
118+
and tokenizer.num_tokens(new_context_string) > max_context_tokens
119+
):
116120
break
117121
context_string = new_context_string
118122

@@ -122,7 +126,9 @@ def _get_context_string(
122126
)
123127

124128

125-
def parallel_sort_context_batch(community_df, max_context_tokens, parallel=False):
129+
def parallel_sort_context_batch(
130+
community_df, tokenizer: Tokenizer, max_context_tokens, parallel=False
131+
):
126132
"""Calculate context using parallelization if enabled."""
127133
if parallel:
128134
# Use ThreadPoolExecutor for parallel execution
@@ -131,7 +137,9 @@ def parallel_sort_context_batch(community_df, max_context_tokens, parallel=False
131137
with ThreadPoolExecutor(max_workers=None) as executor:
132138
context_strings = list(
133139
executor.map(
134-
lambda x: sort_context(x, max_context_tokens=max_context_tokens),
140+
lambda x: sort_context(
141+
x, tokenizer, max_context_tokens=max_context_tokens
142+
),
135143
community_df[schemas.ALL_CONTEXT],
136144
)
137145
)
@@ -141,13 +149,13 @@ def parallel_sort_context_batch(community_df, max_context_tokens, parallel=False
141149
# Assign context strings directly to the DataFrame
142150
community_df[schemas.CONTEXT_STRING] = community_df[schemas.ALL_CONTEXT].apply(
143151
lambda context_list: sort_context(
144-
context_list, max_context_tokens=max_context_tokens
152+
context_list, tokenizer, max_context_tokens=max_context_tokens
145153
)
146154
)
147155

148156
# Calculate other columns
149157
community_df[schemas.CONTEXT_SIZE] = community_df[schemas.CONTEXT_STRING].apply(
150-
num_tokens
158+
tokenizer.num_tokens
151159
)
152160
community_df[schemas.CONTEXT_EXCEED_FLAG] = (
153161
community_df[schemas.CONTEXT_SIZE] > max_context_tokens

graphrag/index/operations/summarize_communities/summarize_communities.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
)
2424
from graphrag.index.utils.derive_from_rows import derive_from_rows
2525
from graphrag.logger.progress import progress_ticker
26+
from graphrag.tokenizer.tokenizer import Tokenizer
2627

2728
logger = logging.getLogger(__name__)
2829

@@ -35,6 +36,7 @@ async def summarize_communities(
3536
callbacks: WorkflowCallbacks,
3637
cache: PipelineCache,
3738
strategy: dict,
39+
tokenizer: Tokenizer,
3840
max_input_length: int,
3941
async_mode: AsyncType = AsyncType.AsyncIO,
4042
num_threads: int = 4,
@@ -44,7 +46,6 @@ async def summarize_communities(
4446
tick = progress_ticker(callbacks.progress, len(local_contexts))
4547
strategy_exec = load_strategy(strategy["type"])
4648
strategy_config = {**strategy}
47-
4849
community_hierarchy = (
4950
communities.explode("children")
5051
.rename({"children": "sub_community"}, axis=1)
@@ -60,6 +61,7 @@ async def summarize_communities(
6061
community_hierarchy_df=community_hierarchy,
6162
local_context_df=local_contexts,
6263
level=level,
64+
tokenizer=tokenizer,
6365
max_context_tokens=max_input_length,
6466
)
6567
level_contexts.append(level_context)

graphrag/index/operations/summarize_communities/text_unit_context/context_builder.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from graphrag.index.operations.summarize_communities.text_unit_context.sort_context import (
1919
sort_context,
2020
)
21-
from graphrag.query.llm.text_utils import num_tokens
21+
from graphrag.tokenizer.tokenizer import Tokenizer
2222

2323
logger = logging.getLogger(__name__)
2424

@@ -27,6 +27,7 @@ def build_local_context(
2727
community_membership_df: pd.DataFrame,
2828
text_units_df: pd.DataFrame,
2929
node_df: pd.DataFrame,
30+
tokenizer: Tokenizer,
3031
max_context_tokens: int = 16000,
3132
) -> pd.DataFrame:
3233
"""
@@ -69,10 +70,10 @@ def build_local_context(
6970
.reset_index()
7071
)
7172
context_df[schemas.CONTEXT_STRING] = context_df[schemas.ALL_CONTEXT].apply(
72-
lambda x: sort_context(x)
73+
lambda x: sort_context(x, tokenizer)
7374
)
7475
context_df[schemas.CONTEXT_SIZE] = context_df[schemas.CONTEXT_STRING].apply(
75-
lambda x: num_tokens(x)
76+
lambda x: tokenizer.num_tokens(x)
7677
)
7778
context_df[schemas.CONTEXT_EXCEED_FLAG] = context_df[schemas.CONTEXT_SIZE].apply(
7879
lambda x: x > max_context_tokens
@@ -86,6 +87,7 @@ def build_level_context(
8687
community_hierarchy_df: pd.DataFrame,
8788
local_context_df: pd.DataFrame,
8889
level: int,
90+
tokenizer: Tokenizer,
8991
max_context_tokens: int = 16000,
9092
) -> pd.DataFrame:
9193
"""
@@ -116,10 +118,12 @@ def build_level_context(
116118

117119
invalid_context_df.loc[:, [schemas.CONTEXT_STRING]] = invalid_context_df[
118120
schemas.ALL_CONTEXT
119-
].apply(lambda x: sort_context(x, max_context_tokens=max_context_tokens))
121+
].apply(
122+
lambda x: sort_context(x, tokenizer, max_context_tokens=max_context_tokens)
123+
)
120124
invalid_context_df.loc[:, [schemas.CONTEXT_SIZE]] = invalid_context_df[
121125
schemas.CONTEXT_STRING
122-
].apply(lambda x: num_tokens(x))
126+
].apply(lambda x: tokenizer.num_tokens(x))
123127
invalid_context_df.loc[:, [schemas.CONTEXT_EXCEED_FLAG]] = False
124128

125129
return pd.concat([valid_context_df, invalid_context_df])
@@ -199,10 +203,10 @@ def build_level_context(
199203
.reset_index()
200204
)
201205
community_df[schemas.CONTEXT_STRING] = community_df[schemas.ALL_CONTEXT].apply(
202-
lambda x: build_mixed_context(x, max_context_tokens)
206+
lambda x: build_mixed_context(x, tokenizer, max_context_tokens)
203207
)
204208
community_df[schemas.CONTEXT_SIZE] = community_df[schemas.CONTEXT_STRING].apply(
205-
lambda x: num_tokens(x)
209+
lambda x: tokenizer.num_tokens(x)
206210
)
207211
community_df[schemas.CONTEXT_EXCEED_FLAG] = False
208212
community_df[schemas.COMMUNITY_LEVEL] = level
@@ -220,10 +224,10 @@ def build_level_context(
220224
)
221225
remaining_df[schemas.CONTEXT_STRING] = cast(
222226
"pd.DataFrame", remaining_df[schemas.ALL_CONTEXT]
223-
).apply(lambda x: sort_context(x, max_context_tokens=max_context_tokens))
227+
).apply(lambda x: sort_context(x, tokenizer, max_context_tokens=max_context_tokens))
224228
remaining_df[schemas.CONTEXT_SIZE] = cast(
225229
"pd.DataFrame", remaining_df[schemas.CONTEXT_STRING]
226-
).apply(lambda x: num_tokens(x))
230+
).apply(lambda x: tokenizer.num_tokens(x))
227231
remaining_df[schemas.CONTEXT_EXCEED_FLAG] = False
228232

229233
return cast(

graphrag/index/operations/summarize_communities/text_unit_context/sort_context.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import pandas as pd
99

1010
import graphrag.data_model.schemas as schemas
11-
from graphrag.query.llm.text_utils import num_tokens
11+
from graphrag.tokenizer.tokenizer import Tokenizer
1212

1313
logger = logging.getLogger(__name__)
1414

@@ -57,6 +57,7 @@ def get_context_string(
5757

5858
def sort_context(
5959
local_context: list[dict],
60+
tokenizer: Tokenizer,
6061
sub_community_reports: list[dict] | None = None,
6162
max_context_tokens: int | None = None,
6263
) -> str:
@@ -73,7 +74,7 @@ def sort_context(
7374
new_context_string = get_context_string(
7475
current_text_units, sub_community_reports
7576
)
76-
if num_tokens(new_context_string) > max_context_tokens:
77+
if tokenizer.num_tokens(new_context_string) > max_context_tokens:
7778
break
7879

7980
context_string = new_context_string

0 commit comments

Comments
 (0)