Skip to content

Commit 90479c0

Browse files
committed
Move Tokenizer back to GR core
1 parent 247547f commit 90479c0

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+58
-71
lines changed

packages/graphrag-common/graphrag_common/types/__init__.py

Lines changed: 0 additions & 8 deletions
This file was deleted.

packages/graphrag/graphrag/chunking/chunker_factory.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from collections.abc import Callable
77

88
from graphrag_common.factory.factory import Factory, ServiceScope
9-
from graphrag_common.types.tokenizer import Tokenizer
109

1110
from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
1211
from graphrag.chunking.chunker import Chunker
@@ -38,7 +37,9 @@ def register_chunker(
3837

3938

4039
def create_chunker(
41-
config: ChunkingConfig, tokenizer: Tokenizer | None = None
40+
config: ChunkingConfig,
41+
encode: Callable[[str], list[int]] | None,
42+
decode: Callable[[list[int]], str] | None,
4243
) -> Chunker:
4344
"""Create a chunker implementation based on the given configuration.
4445
@@ -53,8 +54,10 @@ def create_chunker(
5354
The created chunker implementation.
5455
"""
5556
config_model = config.model_dump()
56-
if tokenizer is not None:
57-
config_model["tokenizer"] = tokenizer
57+
if encode is not None:
58+
config_model["encode"] = encode
59+
if decode is not None:
60+
config_model["decode"] = decode
5861
chunker_strategy = config.strategy
5962

6063
if chunker_strategy not in chunker_factory:

packages/graphrag/graphrag/chunking/token_chunker.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,8 @@
66
from collections.abc import Callable
77
from typing import Any
88

9-
from graphrag_common.types.tokenizer import Tokenizer
10-
119
from graphrag.chunking.chunker import Chunker
1210

13-
EncodedText = list[int]
14-
DecodeFn = Callable[[EncodedText], str]
15-
EncodeFn = Callable[[str], EncodedText]
16-
1711

1812
class TokenChunker(Chunker):
1913
"""A chunker that splits text into token-based chunks."""
@@ -22,31 +16,33 @@ def __init__(
2216
self,
2317
size: int,
2418
overlap: int,
25-
tokenizer: Tokenizer,
19+
encode: Callable[[str], list[int]],
20+
decode: Callable[[list[int]], str],
2621
**kwargs: Any,
2722
) -> None:
2823
"""Create a token chunker instance."""
2924
self._size = size
3025
self._overlap = overlap
31-
self._tokenizer = tokenizer
26+
self._encode = encode
27+
self._decode = decode
3228

3329
def chunk(self, text: str) -> list[str]:
3430
"""Chunk the text into token-based chunks."""
3531
return split_text_on_tokens(
3632
text,
3733
chunk_size=self._size,
3834
chunk_overlap=self._overlap,
39-
encode=self._tokenizer.encode,
40-
decode=self._tokenizer.decode,
35+
encode=self._encode,
36+
decode=self._decode,
4137
)
4238

4339

4440
def split_text_on_tokens(
4541
text: str,
4642
chunk_size: int,
4743
chunk_overlap: int,
48-
encode: EncodeFn,
49-
decode: DecodeFn,
44+
encode: Callable[[str], list[int]],
45+
decode: Callable[[list[int]], str],
5046
) -> list[str]:
5147
"""Split a single text and return chunks using the tokenizer."""
5248
result = []

packages/graphrag/graphrag/index/operations/embed_text/embed_text.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@
77

88
import numpy as np
99
import pandas as pd
10-
from graphrag_common.types.tokenizer import Tokenizer
1110

1211
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
1312
from graphrag.index.operations.embed_text.run_embed_text import run_embed_text
1413
from graphrag.language_model.protocol.base import EmbeddingModel
14+
from graphrag.tokenizer.tokenizer import Tokenizer
1515
from graphrag.vector_stores.base import BaseVectorStore, VectorStoreDocument
1616

1717
logger = logging.getLogger(__name__)

packages/graphrag/graphrag/index/operations/embed_text/run_embed_text.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@
88
from dataclasses import dataclass
99

1010
import numpy as np
11-
from graphrag_common.types.tokenizer import Tokenizer
1211

1312
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
1413
from graphrag.chunking.token_chunker import split_text_on_tokens
1514
from graphrag.index.utils.is_null import is_null
1615
from graphrag.language_model.protocol.base import EmbeddingModel
1716
from graphrag.logger.progress import ProgressTicker, progress_ticker
17+
from graphrag.tokenizer.tokenizer import Tokenizer
1818

1919
logger = logging.getLogger(__name__)
2020

packages/graphrag/graphrag/index/operations/summarize_communities/build_mixed_context.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44
"""A module containing build_mixed_context method definition."""
55

66
import pandas as pd
7-
from graphrag_common.types.tokenizer import Tokenizer
87

98
import graphrag.data_model.schemas as schemas
109
from graphrag.index.operations.summarize_communities.graph_context.sort_context import (
1110
sort_context,
1211
)
12+
from graphrag.tokenizer.tokenizer import Tokenizer
1313

1414

1515
def build_mixed_context(

packages/graphrag/graphrag/index/operations/summarize_communities/graph_context/context_builder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from typing import cast
88

99
import pandas as pd
10-
from graphrag_common.types.tokenizer import Tokenizer
1110

1211
import graphrag.data_model.schemas as schemas
1312
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
@@ -31,6 +30,7 @@
3130
where_column_equals,
3231
)
3332
from graphrag.logger.progress import progress_iterable
33+
from graphrag.tokenizer.tokenizer import Tokenizer
3434

3535
logger = logging.getLogger(__name__)
3636

packages/graphrag/graphrag/index/operations/summarize_communities/graph_context/sort_context.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
"""Sort context by degree in descending order."""
44

55
import pandas as pd
6-
from graphrag_common.types.tokenizer import Tokenizer
76

87
import graphrag.data_model.schemas as schemas
8+
from graphrag.tokenizer.tokenizer import Tokenizer
99

1010

1111
def sort_context(

packages/graphrag/graphrag/index/operations/summarize_communities/summarize_communities.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from collections.abc import Callable
88

99
import pandas as pd
10-
from graphrag_common.types.tokenizer import Tokenizer
1110

1211
import graphrag.data_model.schemas as schemas
1312
from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks
@@ -27,6 +26,7 @@
2726
from graphrag.index.utils.derive_from_rows import derive_from_rows
2827
from graphrag.language_model.protocol.base import ChatModel
2928
from graphrag.logger.progress import progress_ticker
29+
from graphrag.tokenizer.tokenizer import Tokenizer
3030

3131
logger = logging.getLogger(__name__)
3232

packages/graphrag/graphrag/index/operations/summarize_communities/text_unit_context/context_builder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from typing import cast
88

99
import pandas as pd
10-
from graphrag_common.types.tokenizer import Tokenizer
1110

1211
import graphrag.data_model.schemas as schemas
1312
from graphrag.index.operations.summarize_communities.build_mixed_context import (
@@ -19,6 +18,7 @@
1918
from graphrag.index.operations.summarize_communities.text_unit_context.sort_context import (
2019
sort_context,
2120
)
21+
from graphrag.tokenizer.tokenizer import Tokenizer
2222

2323
logger = logging.getLogger(__name__)
2424

0 commit comments

Comments
 (0)