Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 9dccc1e

Browse files
authored
FEATURE: Add Qwen3 tokenizer and update Gemma to version 3 (#1440)
1 parent df925f8 commit 9dccc1e

File tree

9 files changed

+3,137,139
-838,955
lines changed

9 files changed

+3,137,139
-838,955
lines changed

app/models/embedding_definition.rb

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ def tokenizer_names
2424
DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer,
2525
DiscourseAi::Tokenizer::OpenAiTokenizer,
2626
DiscourseAi::Tokenizer::MixtralTokenizer,
27+
DiscourseAi::Tokenizer::QwenTokenizer,
2728
].map(&:name)
2829
end
2930

lib/tokenizer/basic_tokenizer.rb

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ def available_llm_tokenizers
1111
DiscourseAi::Tokenizer::Llama3Tokenizer,
1212
DiscourseAi::Tokenizer::MixtralTokenizer,
1313
DiscourseAi::Tokenizer::OpenAiTokenizer,
14+
DiscourseAi::Tokenizer::QwenTokenizer,
1415
]
1516
end
1617

lib/tokenizer/gemini_tokenizer.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ module DiscourseAi
44
module Tokenizer
55
class GeminiTokenizer < BasicTokenizer
66
def self.tokenizer
7-
@@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/gemma2.json")
7+
@@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/gemma3.json")
88
end
99
end
1010
end
lib/tokenizer/qwen_tokenizer.rb

Lines changed: 11 additions & 0 deletions

Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseAi
4+
module Tokenizer
5+
class QwenTokenizer < BasicTokenizer
6+
def self.tokenizer
7+
@@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/qwen3.json")
8+
end
9+
end
10+
end
11+
end

spec/shared/tokenizer_spec.rb

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -257,3 +257,32 @@
257257
end
258258
end
259259
end
260+
261+
describe DiscourseAi::Tokenizer::QwenTokenizer do
262+
describe "#size" do
263+
describe "returns a token count" do
264+
it "for a sentence with punctuation and capitalization and numbers" do
265+
expect(described_class.size("Hello, World! 123")).to eq(8)
266+
end
267+
end
268+
end
269+
270+
describe "#truncate" do
271+
it "truncates a sentence" do
272+
sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
273+
expect(described_class.truncate(sentence, 3)).to eq("foo bar baz")
274+
end
275+
276+
it "truncates a sentence successfully at a multibyte unicode character" do
277+
sentence = "foo bar πŸ‘¨πŸΏβ€πŸ‘©πŸΏβ€πŸ‘§πŸΏβ€πŸ‘§πŸΏ baz qux quux corge grault garply waldo fred plugh xyzzy thud"
278+
expect(described_class.truncate(sentence, 8)).to eq("foo bar πŸ‘¨πŸΏβ€πŸ‘©")
279+
end
280+
281+
it "truncates unicode characters properly when they use more than one token per char" do
282+
sentence = "ζˆ‘ε–œζ¬’εƒζ―”θ¨"
283+
original_size = described_class.size(sentence)
284+
expect(described_class.size(described_class.truncate(sentence, original_size - 2))).to be <
285+
original_size
286+
end
287+
end
288+
end

tokenizers/README.md

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,10 @@ Licensed under MIT License
3434

3535
Licensed under META LLAMA 3 COMMUNITY LICENSE
3636

37-
## Gemma 2
37+
## Gemma 3
3838

3939
Licensed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms)
40+
41+
## Qwen 3
42+
43+
Licensed under the Apache 2.0 License

0 commit comments

Comments
(0)