
Commit 3a6755b

DEV: Move tokenizers to a gem

1 parent 40fa527 · commit 3a6755b


55 files changed: 144 additions & 5,670,740 deletions
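The change is mechanical across the plugin: the tokenizer classes previously vendored under the singular DiscourseAi::Tokenizer namespace (together with their large vocabulary/model files, presumably where most of the ~5.7 million deleted lines come from) are now provided by an external gem under the plural DiscourseAi::Tokenizers namespace, and several call sites gain an optional strict: keyword forwarded from SiteSetting.ai_strict_token_counting. A minimal before/after sketch of a call site follows; the sample text and token limit are illustrative, not taken from the commit:

    # Before: tokenizer vendored in the plugin (singular namespace)
    DiscourseAi::Tokenizer::OpenAiTokenizer.truncate("a long post body...", 100)

    # After: tokenizer provided by the gem (plural namespace), with the new
    # strict: option threaded through from the site setting at call sites
    DiscourseAi::Tokenizers::OpenAiTokenizer.truncate(
      "a long post body...",
      100,
      strict: SiteSetting.ai_strict_token_counting,
    )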

.gitignore

Lines changed: 0 additions & 1 deletion
@@ -6,5 +6,4 @@ evals/log
 evals/cases
 config/eval-llms.local.yml
 # this gets rid of search results from ag, ripgrep, etc
-tokenizers/
 public/ai-share/highlight.min.js

app/models/embedding_definition.rb

Lines changed: 15 additions & 15 deletions
@@ -17,14 +17,14 @@ def distance_functions
 
   def tokenizer_names
     [
-      DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer,
-      DiscourseAi::Tokenizer::BgeLargeEnTokenizer,
-      DiscourseAi::Tokenizer::BgeM3Tokenizer,
-      DiscourseAi::Tokenizer::GeminiTokenizer,
-      DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer,
-      DiscourseAi::Tokenizer::OpenAiTokenizer,
-      DiscourseAi::Tokenizer::MixtralTokenizer,
-      DiscourseAi::Tokenizer::QwenTokenizer,
+      DiscourseAi::Tokenizers::AllMpnetBaseV2Tokenizer,
+      DiscourseAi::Tokenizers::BgeLargeEnTokenizer,
+      DiscourseAi::Tokenizers::BgeM3Tokenizer,
+      DiscourseAi::Tokenizers::GeminiTokenizer,
+      DiscourseAi::Tokenizers::MultilingualE5LargeTokenizer,
+      DiscourseAi::Tokenizers::OpenAiTokenizer,
+      DiscourseAi::Tokenizers::MistralTokenizer,
+      DiscourseAi::Tokenizers::QwenTokenizer,
     ].map(&:name)
   end
 
@@ -42,7 +42,7 @@ def presets
       dimensions: 1024,
       max_sequence_length: 512,
       pg_function: "<#>",
-      tokenizer_class: "DiscourseAi::Tokenizer::BgeLargeEnTokenizer",
+      tokenizer_class: "DiscourseAi::Tokenizers::BgeLargeEnTokenizer",
       provider: HUGGING_FACE,
       search_prompt: "Represent this sentence for searching relevant passages:",
     },
@@ -52,7 +52,7 @@ def presets
       dimensions: 1024,
       max_sequence_length: 8192,
       pg_function: "<#>",
-      tokenizer_class: "DiscourseAi::Tokenizer::BgeM3Tokenizer",
+      tokenizer_class: "DiscourseAi::Tokenizers::BgeM3Tokenizer",
       provider: HUGGING_FACE,
     },
     {
@@ -63,7 +63,7 @@ def presets
       pg_function: "<=>",
       url:
         "https://generativelanguage.googleapis.com/v1beta/models/embedding-001:embedContent",
-      tokenizer_class: "DiscourseAi::Tokenizer::GeminiTokenizer",
+      tokenizer_class: "DiscourseAi::Tokenizers::GeminiTokenizer",
       provider: GOOGLE,
     },
     {
@@ -72,7 +72,7 @@ def presets
       dimensions: 1024,
       max_sequence_length: 512,
       pg_function: "<=>",
-      tokenizer_class: "DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer",
+      tokenizer_class: "DiscourseAi::Tokenizers::MultilingualE5LargeTokenizer",
       provider: HUGGING_FACE,
     },
     # "text-embedding-3-large" real dimentions are 3072, but we only support up to 2000 in the
@@ -83,7 +83,7 @@ def presets
       dimensions: 2000,
       max_sequence_length: 8191,
       pg_function: "<=>",
-      tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
+      tokenizer_class: "DiscourseAi::Tokenizers::OpenAiTokenizer",
       url: "https://api.openai.com/v1/embeddings",
       provider: OPEN_AI,
       matryoshka_dimensions: true,
@@ -97,7 +97,7 @@ def presets
       dimensions: 1536,
       max_sequence_length: 8191,
       pg_function: "<=>",
-      tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
+      tokenizer_class: "DiscourseAi::Tokenizers::OpenAiTokenizer",
       url: "https://api.openai.com/v1/embeddings",
       provider: OPEN_AI,
       matryoshka_dimensions: true,
@@ -111,7 +111,7 @@ def presets
       dimensions: 1536,
       max_sequence_length: 8191,
       pg_function: "<=>",
-      tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
+      tokenizer_class: "DiscourseAi::Tokenizers::OpenAiTokenizer",
       url: "https://api.openai.com/v1/embeddings",
       provider: OPEN_AI,
       provider_params: {

config/eval-llms.yml

Lines changed: 8 additions & 8 deletions
@@ -2,7 +2,7 @@ llms:
   gpt-4o:
     display_name: GPT-4o
     name: gpt-4o
-    tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer
+    tokenizer: DiscourseAi::Tokenizers::OpenAiTokenizer
     api_key_env: OPENAI_API_KEY
     provider: open_ai
     url: https://api.openai.com/v1/chat/completions
@@ -12,7 +12,7 @@ llms:
   gpt-4o-mini:
     display_name: GPT-4o-mini
     name: gpt-4o-mini
-    tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer
+    tokenizer: DiscourseAi::Tokenizers::OpenAiTokenizer
     api_key_env: OPENAI_API_KEY
     provider: open_ai
     url: https://api.openai.com/v1/chat/completions
@@ -22,7 +22,7 @@ llms:
   claude-3.5-haiku:
     display_name: Claude 3.5 Haiku
     name: claude-3-5-haiku-latest
-    tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
+    tokenizer: DiscourseAi::Tokenizers::AnthropicTokenizer
     api_key_env: ANTHROPIC_API_KEY
     provider: anthropic
     url: https://api.anthropic.com/v1/messages
@@ -32,7 +32,7 @@ llms:
  claude-3.5-sonnet:
     display_name: Claude 3.5 Sonnet
     name: claude-3-5-sonnet-latest
-    tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
+    tokenizer: DiscourseAi::Tokenizers::AnthropicTokenizer
     api_key_env: ANTHROPIC_API_KEY
     provider: anthropic
     url: https://api.anthropic.com/v1/messages
@@ -42,7 +42,7 @@ llms:
   claude-3.7-sonnet:
     display_name: Claude 3.7 Sonnet
     name: claude-3-7-sonnet-latest
-    tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
+    tokenizer: DiscourseAi::Tokenizers::AnthropicTokenizer
     api_key_env: ANTHROPIC_API_KEY
     provider: anthropic
     url: https://api.anthropic.com/v1/messages
@@ -52,7 +52,7 @@ llms:
   claude-3.7-sonnet-thinking:
     display_name: Claude 3.7 Sonnet
     name: claude-3-7-sonnet-latest
-    tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
+    tokenizer: DiscourseAi::Tokenizers::AnthropicTokenizer
     api_key_env: ANTHROPIC_API_KEY
     provider: anthropic
     url: https://api.anthropic.com/v1/messages
@@ -67,7 +67,7 @@ llms:
   gemini-2.0-flash:
     display_name: Gemini 2.0 Flash
     name: gemini-2-0-flash
-    tokenizer: DiscourseAi::Tokenizer::GeminiTokenizer
+    tokenizer: DiscourseAi::Tokenizers::GeminiTokenizer
     api_key_env: GEMINI_API_KEY
     provider: google
     url: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash
@@ -77,7 +77,7 @@ llms:
   gemini-2.0-pro-exp:
     display_name: Gemini 2.0 pro
     name: gemini-2-0-pro-exp
-    tokenizer: DiscourseAi::Tokenizer::GeminiTokenizer
+    tokenizer: DiscourseAi::Tokenizers::GeminiTokenizer
     api_key_env: GEMINI_API_KEY
     provider: google
     url: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-pro-exp

lib/automation/llm_triage.rb

Lines changed: 6 additions & 1 deletion
@@ -40,7 +40,12 @@ def self.handle(
 
     content = "title: #{post.topic.title}\n#{post.raw}"
 
-    content = llm.tokenizer.truncate(content, max_post_tokens) if max_post_tokens.present?
+    content =
+      llm.tokenizer.truncate(
+        content,
+        max_post_tokens,
+        strict: SiteSetting.ai_strict_token_counting,
+      ) if max_post_tokens.present?
 
     if post.upload_ids.present?
       content = [content]
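The same truncation can be exercised directly against one of the gem-provided tokenizer classes. This is a hedged sketch: the sample text and limit are invented, and the exact semantics of strict: are not spelled out in this commit, only that it is forwarded from SiteSetting.ai_strict_token_counting.

    # Illustrative only: truncate arbitrary text to at most 512 tokens
    long_text = "word " * 5_000
    DiscourseAi::Tokenizers::OpenAiTokenizer.truncate(long_text, 512, strict: true)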

lib/automation/report_context_generator.rb

Lines changed: 7 additions & 2 deletions
@@ -26,7 +26,7 @@ def initialize(
     @tags = tags
     @allow_secure_categories = allow_secure_categories
     @max_posts = max_posts
-    @tokenizer = tokenizer || DiscourseAi::Tokenizer::OpenAiTokenizer
+    @tokenizer = tokenizer || DiscourseAi::Tokenizers::OpenAiTokenizer
     @tokens_per_post = tokens_per_post
     @prioritized_group_ids = prioritized_group_ids
 
@@ -99,7 +99,12 @@ def format_post(post)
     buffer << post.created_at.strftime("%Y-%m-%d %H:%M")
     buffer << "user: #{post.user&.username}"
     buffer << "likes: #{post.like_count}"
-    excerpt = @tokenizer.truncate(post.raw, @tokens_per_post)
+    excerpt =
+      @tokenizer.truncate(
+        post.raw,
+        @tokens_per_post,
+        strict: SiteSetting.ai_strict_token_counting,
+      )
     excerpt = "excerpt: #{excerpt}..." if excerpt.length < post.raw.length
     buffer << "#{excerpt}"
     { likes: post.like_count, info: buffer.join("\n") }

lib/completions/dialects/dialect.rb

Lines changed: 1 addition & 0 deletions
@@ -147,6 +147,7 @@ def trim_messages(messages)
       system_message[:content] = tokenizer.truncate(
         system_message[:content],
         max_system_tokens,
+        strict: SiteSetting.ai_strict_token_counting,
       )
     end

lib/completions/dialects/fake.rb

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ def can_translate?(llm_model)
   end
 
   def tokenizer
-    DiscourseAi::Tokenizer::OpenAiTokenizer
+    DiscourseAi::Tokenizers::OpenAiTokenizer
   end
 
   def translate

lib/completions/dialects/open_ai_compatible.rb

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ def can_translate?(_llm_model)
   end
 
   def tokenizer
-    llm_model&.tokenizer_class || DiscourseAi::Tokenizer::Llama3Tokenizer
+    llm_model&.tokenizer_class || DiscourseAi::Tokenizers::Llama3Tokenizer
   end
 
   def tools

lib/completions/endpoints/canned_response.rb

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@ def perform_completion!(
   end
 
   def tokenizer
-    DiscourseAi::Tokenizer::OpenAiTokenizer
+    DiscourseAi::Tokenizers::OpenAiTokenizer
   end
 
   private

lib/completions/llm.rb

Lines changed: 7 additions & 7 deletions
@@ -59,7 +59,7 @@ def presets
           output_cost: 75,
         },
       ],
-      tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer,
+      tokenizer: DiscourseAi::Tokenizers::AnthropicTokenizer,
       endpoint: "https://api.anthropic.com/v1/messages",
       provider: "anthropic",
     },
@@ -103,7 +103,7 @@ def presets
           output_cost: 0.30,
         },
       ],
-      tokenizer: DiscourseAi::Tokenizer::GeminiTokenizer,
+      tokenizer: DiscourseAi::Tokenizers::GeminiTokenizer,
       provider: "google",
     },
     {
@@ -150,7 +150,7 @@ def presets
           output_cost: 0.40,
         },
       ],
-      tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer,
+      tokenizer: DiscourseAi::Tokenizers::OpenAiTokenizer,
       endpoint: "https://api.openai.com/v1/chat/completions",
       provider: "open_ai",
     },
@@ -172,7 +172,7 @@ def presets
           output_cost: 0.20,
         },
       ],
-      tokenizer: DiscourseAi::Tokenizer::Llama3Tokenizer,
+      tokenizer: DiscourseAi::Tokenizers::Llama3Tokenizer,
       endpoint: "https://api.sambanova.ai/v1/chat/completions",
       provider: "samba_nova",
     },
@@ -190,7 +190,7 @@ def presets
           display_name: "Pixtral Large",
         },
       ],
-      tokenizer: DiscourseAi::Tokenizer::MixtralTokenizer,
+      tokenizer: DiscourseAi::Tokenizers::MixtralTokenizer,
       endpoint: "https://api.mistral.ai/v1/chat/completions",
       provider: "mistral",
     },
@@ -217,7 +217,7 @@ def presets
           output_cost: 0.25,
         },
       ],
-      tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer,
+      tokenizer: DiscourseAi::Tokenizers::OpenAiTokenizer,
       endpoint: "https://openrouter.ai/api/v1/chat/completions",
       provider: "open_router",
     },
@@ -248,7 +248,7 @@ def provider_names
     end
 
     def tokenizer_names
-      DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name)
+      DiscourseAi::Tokenizers::BasicTokenizer.available_llm_tokenizers.map(&:name)
    end
 
     def valid_provider_models
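Since the selectable LLM tokenizer list is now built from the gem, a quick console check (illustrative; assumes the plugin and its new tokenizers gem are loaded in a Rails console) confirms the renamed constants resolve:

    # Lists the fully qualified class names offered for LLM configuration
    DiscourseAi::Tokenizers::BasicTokenizer.available_llm_tokenizers.map(&:name)
    # => e.g. ["DiscourseAi::Tokenizers::OpenAiTokenizer", "DiscourseAi::Tokenizers::AnthropicTokenizer", ...]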
