Skip to content

Commit e014f64

Browse files
committed
Fix usage of get_tokenizer.
1 parent 49ed82a commit e014f64

File tree

16 files changed

+38
-32
lines changed

16 files changed

+38
-32
lines changed

graphrag/index/text_splitting/text_splitting.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,12 +85,12 @@ class TokenTextSplitter(TextSplitter):
8585

8686
def __init__(
8787
self,
88-
tokenizer: Tokenizer = get_tokenizer(),
88+
tokenizer: Tokenizer | None = None,
8989
**kwargs: Any,
9090
):
9191
"""Init method definition."""
9292
super().__init__(**kwargs)
93-
self._tokenizer = tokenizer
93+
self._tokenizer = tokenizer or get_tokenizer()
9494

9595
def num_tokens(self, text: str) -> int:
9696
"""Return the number of tokens in a string."""

graphrag/prompt_tune/generator/extract_graph_prompt.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def create_extract_graph_prompt(
2424
examples: list[str],
2525
language: str,
2626
max_token_count: int,
27-
tokenizer: Tokenizer = get_tokenizer(),
27+
tokenizer: Tokenizer | None = None,
2828
json_mode: bool = False,
2929
output_path: Path | None = None,
3030
min_examples_required: int = 2,
@@ -56,6 +56,8 @@ def create_extract_graph_prompt(
5656
if isinstance(entity_types, list):
5757
entity_types = ", ".join(map(str, entity_types))
5858

59+
tokenizer = tokenizer or get_tokenizer()
60+
5961
tokens_left = (
6062
max_token_count
6163
- tokenizer.num_tokens(prompt)

graphrag/query/context_builder/community_context.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
def build_community_context(
2525
community_reports: list[CommunityReport],
2626
entities: list[Entity] | None = None,
27-
tokenizer: Tokenizer = get_tokenizer(),
27+
tokenizer: Tokenizer | None = None,
2828
use_community_summary: bool = True,
2929
column_delimiter: str = "|",
3030
shuffle_data: bool = True,
@@ -46,6 +46,7 @@ def build_community_context(
4646
4747
The calculated weight is added as an attribute to the community reports and added to the context data table.
4848
"""
49+
tokenizer = tokenizer or get_tokenizer()
4950

5051
def _is_included(report: CommunityReport) -> bool:
5152
return report.rank is not None and report.rank >= min_community_rank

graphrag/query/context_builder/conversation_history.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def get_user_turns(self, max_user_turns: int | None = 1) -> list[str]:
148148

149149
def build_context(
150150
self,
151-
tokenizer: Tokenizer = get_tokenizer(),
151+
tokenizer: Tokenizer | None = None,
152152
include_user_turns_only: bool = True,
153153
max_qa_turns: int | None = 5,
154154
max_context_tokens: int = 8000,
@@ -168,6 +168,7 @@ def build_context(
168168
context_name: Name of the context, default is "Conversation History".
169169
170170
"""
171+
tokenizer = tokenizer or get_tokenizer()
171172
qa_turns = self.to_qa_turns()
172173
if include_user_turns_only:
173174
qa_turns = [

graphrag/query/context_builder/local_context.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,16 @@
2929

3030
def build_entity_context(
3131
selected_entities: list[Entity],
32-
tokenizer: Tokenizer = get_tokenizer(),
32+
tokenizer: Tokenizer | None = None,
3333
max_context_tokens: int = 8000,
3434
include_entity_rank: bool = True,
3535
rank_description: str = "number of relationships",
3636
column_delimiter: str = "|",
3737
context_name="Entities",
3838
) -> tuple[str, pd.DataFrame]:
3939
"""Prepare entity data table as context data for system prompt."""
40+
tokenizer = tokenizer or get_tokenizer()
41+
4042
if len(selected_entities) == 0:
4143
return "", pd.DataFrame()
4244

@@ -91,12 +93,13 @@ def build_entity_context(
9193
def build_covariates_context(
9294
selected_entities: list[Entity],
9395
covariates: list[Covariate],
94-
tokenizer: Tokenizer = get_tokenizer(),
96+
tokenizer: Tokenizer | None = None,
9597
max_context_tokens: int = 8000,
9698
column_delimiter: str = "|",
9799
context_name: str = "Covariates",
98100
) -> tuple[str, pd.DataFrame]:
99101
"""Prepare covariate data tables as context data for system prompt."""
102+
tokenizer = tokenizer or get_tokenizer()
100103
# create an empty list of covariates
101104
if len(selected_entities) == 0 or len(covariates) == 0:
102105
return "", pd.DataFrame()
@@ -155,7 +158,7 @@ def build_covariates_context(
155158
def build_relationship_context(
156159
selected_entities: list[Entity],
157160
relationships: list[Relationship],
158-
tokenizer: Tokenizer = get_tokenizer(),
161+
tokenizer: Tokenizer | None = None,
159162
include_relationship_weight: bool = False,
160163
max_context_tokens: int = 8000,
161164
top_k_relationships: int = 10,
@@ -164,6 +167,7 @@ def build_relationship_context(
164167
context_name: str = "Relationships",
165168
) -> tuple[str, pd.DataFrame]:
166169
"""Prepare relationship data tables as context data for system prompt."""
170+
tokenizer = tokenizer or get_tokenizer()
167171
selected_relationships = _filter_relationships(
168172
selected_entities=selected_entities,
169173
relationships=relationships,

graphrag/query/context_builder/source_context.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,15 @@
2020

2121
def build_text_unit_context(
2222
text_units: list[TextUnit],
23-
tokenizer: Tokenizer = get_tokenizer(),
23+
tokenizer: Tokenizer | None = None,
2424
column_delimiter: str = "|",
2525
shuffle_data: bool = True,
2626
max_context_tokens: int = 8000,
2727
context_name: str = "Sources",
2828
random_state: int = 86,
2929
) -> tuple[str, dict[str, pd.DataFrame]]:
3030
"""Prepare text-unit data table as context data for system prompt."""
31+
tokenizer = tokenizer or get_tokenizer()
3132
if text_units is None or len(text_units) == 0:
3233
return ("", {})
3334

graphrag/query/structured_search/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,13 @@ def __init__(
5959
self,
6060
model: ChatModel,
6161
context_builder: T,
62-
tokenizer: Tokenizer = get_tokenizer(),
62+
tokenizer: Tokenizer | None = None,
6363
model_params: dict[str, Any] | None = None,
6464
context_builder_params: dict[str, Any] | None = None,
6565
):
6666
self.model = model
6767
self.context_builder = context_builder
68-
self.tokenizer = tokenizer
68+
self.tokenizer = tokenizer or get_tokenizer()
6969
self.model_params = model_params or {}
7070
self.context_builder_params = context_builder_params or {}
7171

graphrag/query/structured_search/basic_search/basic_context.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,11 @@ def __init__(
3030
text_embedder: EmbeddingModel,
3131
text_unit_embeddings: BaseVectorStore,
3232
text_units: list[TextUnit] | None = None,
33-
tokenizer: Tokenizer = get_tokenizer(),
33+
tokenizer: Tokenizer | None = None,
3434
embedding_vectorstore_key: str = "id",
3535
):
3636
self.text_embedder = text_embedder
37-
self.tokenizer = tokenizer
37+
self.tokenizer = tokenizer or get_tokenizer()
3838
self.text_units = text_units
3939
self.text_unit_embeddings = text_unit_embeddings
4040
self.embedding_vectorstore_key = embedding_vectorstore_key

graphrag/query/structured_search/basic_search/search.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
from graphrag.query.context_builder.builders import BasicContextBuilder
1717
from graphrag.query.context_builder.conversation_history import ConversationHistory
1818
from graphrag.query.structured_search.base import BaseSearch, SearchResult
19-
from graphrag.tokenizer.get_tokenizer import get_tokenizer
2019
from graphrag.tokenizer.tokenizer import Tokenizer
2120

2221
logger = logging.getLogger(__name__)
@@ -32,7 +31,7 @@ def __init__(
3231
self,
3332
model: ChatModel,
3433
context_builder: BasicContextBuilder,
35-
tokenizer: Tokenizer = get_tokenizer(),
34+
tokenizer: Tokenizer | None = None,
3635
system_prompt: str | None = None,
3736
response_type: str = "multiple paragraphs",
3837
callbacks: list[QueryCallbacks] | None = None,

graphrag/query/structured_search/drift_search/drift_context.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def __init__(
4747
reports: list[CommunityReport] | None = None,
4848
relationships: list[Relationship] | None = None,
4949
covariates: dict[str, list[Covariate]] | None = None,
50-
tokenizer: Tokenizer = get_tokenizer(),
50+
tokenizer: Tokenizer | None = None,
5151
embedding_vectorstore_key: str = EntityVectorStoreKey.ID,
5252
config: DRIFTSearchConfig | None = None,
5353
local_system_prompt: str | None = None,
@@ -59,7 +59,7 @@ def __init__(
5959
self.config = config or DRIFTSearchConfig()
6060
self.model = model
6161
self.text_embedder = text_embedder
62-
self.tokenizer = tokenizer
62+
self.tokenizer = tokenizer or get_tokenizer()
6363
self.local_system_prompt = local_system_prompt or DRIFT_LOCAL_SYSTEM_PROMPT
6464
self.reduce_system_prompt = reduce_system_prompt or DRIFT_REDUCE_PROMPT
6565

0 commit comments

Comments (0)