This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 9f77d1c

Tokenizers/Tokenizer

Renames the tokenizer namespace from DiscourseAi::Tokenizers to DiscourseAi::Tokenizer across the embedding definitions, the eval LLM config, automation, completion dialects and endpoints, sentiment classification, and the spec fabricators, and bumps the discourse_ai-tokenizers gem from 0.1.1 to 0.1.2.

1 parent 19c59d9 · commit 9f77d1c

File tree: 18 files changed (+60, -60 lines)

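At a glance, only the namespace constant changes; call sites keep the same tokenizer interface. A minimal sketch of a call site before and after this commit, assuming the truncate(text, max_tokens, strict:) signature seen elsewhere in this diff applies to OpenAiTokenizer as well:

# Before this commit (old namespace):
#   DiscourseAi::Tokenizers::OpenAiTokenizer.truncate(text, 512, strict: false)
#
# After this commit (new namespace):
text = "Some post content to trim before embedding"
DiscourseAi::Tokenizer::OpenAiTokenizer.truncate(text, 512, strict: false)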

app/models/embedding_definition.rb

Lines changed: 15 additions & 15 deletions
@@ -17,14 +17,14 @@ def distance_functions

   def tokenizer_names
     [
-      DiscourseAi::Tokenizers::AllMpnetBaseV2Tokenizer,
-      DiscourseAi::Tokenizers::BgeLargeEnTokenizer,
-      DiscourseAi::Tokenizers::BgeM3Tokenizer,
-      DiscourseAi::Tokenizers::GeminiTokenizer,
-      DiscourseAi::Tokenizers::MultilingualE5LargeTokenizer,
-      DiscourseAi::Tokenizers::OpenAiTokenizer,
-      DiscourseAi::Tokenizers::MistralTokenizer,
-      DiscourseAi::Tokenizers::QwenTokenizer,
+      DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer,
+      DiscourseAi::Tokenizer::BgeLargeEnTokenizer,
+      DiscourseAi::Tokenizer::BgeM3Tokenizer,
+      DiscourseAi::Tokenizer::GeminiTokenizer,
+      DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer,
+      DiscourseAi::Tokenizer::OpenAiTokenizer,
+      DiscourseAi::Tokenizer::MistralTokenizer,
+      DiscourseAi::Tokenizer::QwenTokenizer,
     ].map(&:name)
   end

@@ -42,7 +42,7 @@ def presets
         dimensions: 1024,
         max_sequence_length: 512,
         pg_function: "<#>",
-        tokenizer_class: "DiscourseAi::Tokenizers::BgeLargeEnTokenizer",
+        tokenizer_class: "DiscourseAi::Tokenizer::BgeLargeEnTokenizer",
         provider: HUGGING_FACE,
         search_prompt: "Represent this sentence for searching relevant passages:",
       },
@@ -52,7 +52,7 @@ def presets
         dimensions: 1024,
         max_sequence_length: 8192,
         pg_function: "<#>",
-        tokenizer_class: "DiscourseAi::Tokenizers::BgeM3Tokenizer",
+        tokenizer_class: "DiscourseAi::Tokenizer::BgeM3Tokenizer",
         provider: HUGGING_FACE,
       },
       {
@@ -63,7 +63,7 @@ def presets
         pg_function: "<=>",
         url:
           "https://generativelanguage.googleapis.com/v1beta/models/embedding-001:embedContent",
-        tokenizer_class: "DiscourseAi::Tokenizers::GeminiTokenizer",
+        tokenizer_class: "DiscourseAi::Tokenizer::GeminiTokenizer",
         provider: GOOGLE,
       },
       {
@@ -72,7 +72,7 @@ def presets
         dimensions: 1024,
         max_sequence_length: 512,
         pg_function: "<=>",
-        tokenizer_class: "DiscourseAi::Tokenizers::MultilingualE5LargeTokenizer",
+        tokenizer_class: "DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer",
         provider: HUGGING_FACE,
       },
       # "text-embedding-3-large" real dimentions are 3072, but we only support up to 2000 in the
@@ -83,7 +83,7 @@ def presets
         dimensions: 2000,
         max_sequence_length: 8191,
         pg_function: "<=>",
-        tokenizer_class: "DiscourseAi::Tokenizers::OpenAiTokenizer",
+        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
         url: "https://api.openai.com/v1/embeddings",
         provider: OPEN_AI,
         matryoshka_dimensions: true,
@@ -97,7 +97,7 @@ def presets
         dimensions: 1536,
         max_sequence_length: 8191,
         pg_function: "<=>",
-        tokenizer_class: "DiscourseAi::Tokenizers::OpenAiTokenizer",
+        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
         url: "https://api.openai.com/v1/embeddings",
         provider: OPEN_AI,
         matryoshka_dimensions: true,
@@ -111,7 +111,7 @@ def presets
         dimensions: 1536,
         max_sequence_length: 8191,
         pg_function: "<=>",
-        tokenizer_class: "DiscourseAi::Tokenizers::OpenAiTokenizer",
+        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
         url: "https://api.openai.com/v1/embeddings",
         provider: OPEN_AI,
         provider_params: {
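The presets above store the tokenizer as a string in tokenizer_class, so the stored value has to match the new namespace exactly. A hedged sketch of how such a string would resolve back to a class; the #tokenizer helper below is illustrative only and may not match the model's actual method:

# Illustrative only: resolving a stored tokenizer_class string to a constant.
# Assumes ActiveSupport's String#constantize is available, as in a Rails app.
def tokenizer
  tokenizer_class.constantize
end

# "DiscourseAi::Tokenizer::BgeM3Tokenizer".constantize only resolves once the
# renamed namespace exists, which is why every stored preset string changes here.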

config/eval-llms.yml

Lines changed: 8 additions & 8 deletions
@@ -2,7 +2,7 @@ llms:
   gpt-4o:
     display_name: GPT-4o
     name: gpt-4o
-    tokenizer: DiscourseAi::Tokenizers::OpenAiTokenizer
+    tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer
     api_key_env: OPENAI_API_KEY
     provider: open_ai
     url: https://api.openai.com/v1/chat/completions
@@ -12,7 +12,7 @@ llms:
   gpt-4o-mini:
     display_name: GPT-4o-mini
     name: gpt-4o-mini
-    tokenizer: DiscourseAi::Tokenizers::OpenAiTokenizer
+    tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer
     api_key_env: OPENAI_API_KEY
     provider: open_ai
     url: https://api.openai.com/v1/chat/completions
@@ -22,7 +22,7 @@ llms:
   claude-3.5-haiku:
     display_name: Claude 3.5 Haiku
     name: claude-3-5-haiku-latest
-    tokenizer: DiscourseAi::Tokenizers::AnthropicTokenizer
+    tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
     api_key_env: ANTHROPIC_API_KEY
     provider: anthropic
     url: https://api.anthropic.com/v1/messages
@@ -32,7 +32,7 @@ llms:
   claude-3.5-sonnet:
     display_name: Claude 3.5 Sonnet
     name: claude-3-5-sonnet-latest
-    tokenizer: DiscourseAi::Tokenizers::AnthropicTokenizer
+    tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
     api_key_env: ANTHROPIC_API_KEY
     provider: anthropic
     url: https://api.anthropic.com/v1/messages
@@ -42,7 +42,7 @@ llms:
   claude-3.7-sonnet:
     display_name: Claude 3.7 Sonnet
     name: claude-3-7-sonnet-latest
-    tokenizer: DiscourseAi::Tokenizers::AnthropicTokenizer
+    tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
     api_key_env: ANTHROPIC_API_KEY
     provider: anthropic
     url: https://api.anthropic.com/v1/messages
@@ -52,7 +52,7 @@ llms:
   claude-3.7-sonnet-thinking:
     display_name: Claude 3.7 Sonnet
     name: claude-3-7-sonnet-latest
-    tokenizer: DiscourseAi::Tokenizers::AnthropicTokenizer
+    tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
     api_key_env: ANTHROPIC_API_KEY
     provider: anthropic
     url: https://api.anthropic.com/v1/messages
@@ -67,7 +67,7 @@ llms:
   gemini-2.0-flash:
     display_name: Gemini 2.0 Flash
     name: gemini-2-0-flash
-    tokenizer: DiscourseAi::Tokenizers::GeminiTokenizer
+    tokenizer: DiscourseAi::Tokenizer::GeminiTokenizer
     api_key_env: GEMINI_API_KEY
     provider: google
     url: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash
@@ -77,7 +77,7 @@ llms:
   gemini-2.0-pro-exp:
     display_name: Gemini 2.0 pro
     name: gemini-2-0-pro-exp
-    tokenizer: DiscourseAi::Tokenizers::GeminiTokenizer
+    tokenizer: DiscourseAi::Tokenizer::GeminiTokenizer
     api_key_env: GEMINI_API_KEY
     provider: google
     url: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-pro-exp

lib/automation/report_context_generator.rb

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ def initialize(
     @tags = tags
     @allow_secure_categories = allow_secure_categories
     @max_posts = max_posts
-    @tokenizer = tokenizer || DiscourseAi::Tokenizers::OpenAiTokenizer
+    @tokenizer = tokenizer || DiscourseAi::Tokenizer::OpenAiTokenizer
     @tokens_per_post = tokens_per_post
     @prioritized_group_ids = prioritized_group_ids

lib/completions/dialects/fake.rb

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ def can_translate?(llm_model)
   end

   def tokenizer
-    DiscourseAi::Tokenizers::OpenAiTokenizer
+    DiscourseAi::Tokenizer::OpenAiTokenizer
   end

   def translate

lib/completions/dialects/open_ai_compatible.rb

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ def can_translate?(_llm_model)
   end

   def tokenizer
-    llm_model&.tokenizer_class || DiscourseAi::Tokenizers::Llama3Tokenizer
+    llm_model&.tokenizer_class || DiscourseAi::Tokenizer::Llama3Tokenizer
   end

   def tools

lib/completions/endpoints/canned_response.rb

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@ def perform_completion!(
   end

   def tokenizer
-    DiscourseAi::Tokenizers::OpenAiTokenizer
+    DiscourseAi::Tokenizer::OpenAiTokenizer
   end

   private

lib/completions/llm.rb

Lines changed: 7 additions & 7 deletions
@@ -59,7 +59,7 @@ def presets
             output_cost: 75,
           },
         ],
-        tokenizer: DiscourseAi::Tokenizers::AnthropicTokenizer,
+        tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer,
         endpoint: "https://api.anthropic.com/v1/messages",
         provider: "anthropic",
       },
@@ -103,7 +103,7 @@ def presets
             output_cost: 0.30,
           },
         ],
-        tokenizer: DiscourseAi::Tokenizers::GeminiTokenizer,
+        tokenizer: DiscourseAi::Tokenizer::GeminiTokenizer,
         provider: "google",
       },
       {
@@ -150,7 +150,7 @@ def presets
             output_cost: 0.40,
           },
         ],
-        tokenizer: DiscourseAi::Tokenizers::OpenAiTokenizer,
+        tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer,
         endpoint: "https://api.openai.com/v1/chat/completions",
         provider: "open_ai",
       },
@@ -172,7 +172,7 @@ def presets
             output_cost: 0.20,
           },
         ],
-        tokenizer: DiscourseAi::Tokenizers::Llama3Tokenizer,
+        tokenizer: DiscourseAi::Tokenizer::Llama3Tokenizer,
         endpoint: "https://api.sambanova.ai/v1/chat/completions",
         provider: "samba_nova",
       },
@@ -190,7 +190,7 @@ def presets
             display_name: "Pixtral Large",
           },
         ],
-        tokenizer: DiscourseAi::Tokenizers::MistralTokenizer,
+        tokenizer: DiscourseAi::Tokenizer::MistralTokenizer,
         endpoint: "https://api.mistral.ai/v1/chat/completions",
         provider: "mistral",
       },
@@ -217,7 +217,7 @@ def presets
             output_cost: 0.25,
           },
         ],
-        tokenizer: DiscourseAi::Tokenizers::OpenAiTokenizer,
+        tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer,
         endpoint: "https://openrouter.ai/api/v1/chat/completions",
         provider: "open_router",
       },
@@ -248,7 +248,7 @@ def provider_names
     end

     def tokenizer_names
-      DiscourseAi::Tokenizers::BasicTokenizer.available_llm_tokenizers.map(&:name)
+      DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name)
     end

     def valid_provider_models
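After this change, tokenizer_names should yield fully qualified names under the new namespace. A sketch of the expected shape, assuming the class in lib/completions/llm.rb is DiscourseAi::Completions::Llm; the exact list depends on available_llm_tokenizers, which this diff does not show:

DiscourseAi::Completions::Llm.tokenizer_names
# => ["DiscourseAi::Tokenizer::AnthropicTokenizer",
#     "DiscourseAi::Tokenizer::GeminiTokenizer",
#     "DiscourseAi::Tokenizer::OpenAiTokenizer",
#     ...]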

lib/sentiment/post_classification.rb

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ def prepare_text(target)
         target.raw
       end

-    DiscourseAi::Tokenizers::BertTokenizer.truncate(
+    DiscourseAi::Tokenizer::BertTokenizer.truncate(
       content,
       512,
       strict: SiteSetting.ai_strict_token_counting,

plugin.rb

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@

 gem "tokenizers", "0.5.4"
 gem "tiktoken_ruby", "0.0.11.1"
-gem "discourse_ai-tokenizers", "0.1.1", require_name: "discourse_ai/tokenizers"
+gem "discourse_ai-tokenizers", "0.1.2", require_name: "discourse_ai/tokenizers"
 gem "ed25519", "1.2.4" #TODO remove this as existing ssl gem should handle this

 # we probably want to move all dependencies directly in to the Discourse Gemfile, this

spec/fabricators/embedding_definition_fabricator.rb

Lines changed: 4 additions & 4 deletions
@@ -3,7 +3,7 @@
 Fabricator(:embedding_definition) do
   display_name "Multilingual E5 Large"
   provider "hugging_face"
-  tokenizer_class "DiscourseAi::Tokenizers::MultilingualE5LargeTokenizer"
+  tokenizer_class "DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer"
   api_key "123"
   url "https://test.com/embeddings"
   provider_params nil
@@ -16,15 +16,15 @@
   display_name "BGE Large EN"
   provider "cloudflare"
   pg_function "<#>"
-  tokenizer_class "DiscourseAi::Tokenizers::BgeLargeEnTokenizer"
+  tokenizer_class "DiscourseAi::Tokenizer::BgeLargeEnTokenizer"
   provider_params nil
 end

 Fabricator(:open_ai_embedding_def, from: :embedding_definition) do
   display_name "ADA 002"
   provider "open_ai"
   url "https://api.openai.com/v1/embeddings"
-  tokenizer_class "DiscourseAi::Tokenizers::OpenAiTokenizer"
+  tokenizer_class "DiscourseAi::Tokenizer::OpenAiTokenizer"
   provider_params { { model_name: "text-embedding-ada-002" } }
   max_sequence_length 8191
   dimensions 1536
@@ -35,6 +35,6 @@
   provider "google"
   dimensions 768
   max_sequence_length 1536
-  tokenizer_class "DiscourseAi::Tokenizers::OpenAiTokenizer"
+  tokenizer_class "DiscourseAi::Tokenizer::OpenAiTokenizer"
   url "https://generativelanguage.googleapis.com/v1beta/models/embedding-001:embedContent"
 end
