This repository was archived by the owner on Jul 22, 2025. It is now read-only.
File tree Expand file tree Collapse file tree 9 files changed +3137139
-838955
lines changed Expand file tree Collapse file tree 9 files changed +3137139
-838955
lines changed Original file line number Diff line number Diff line change @@ -24,6 +24,7 @@ def tokenizer_names
2424 DiscourseAi ::Tokenizer ::MultilingualE5LargeTokenizer ,
2525 DiscourseAi ::Tokenizer ::OpenAiTokenizer ,
2626 DiscourseAi ::Tokenizer ::MixtralTokenizer ,
27+ DiscourseAi ::Tokenizer ::QwenTokenizer ,
2728 ] . map ( &:name )
2829 end
2930
Original file line number Diff line number Diff line change @@ -11,6 +11,7 @@ def available_llm_tokenizers
1111 DiscourseAi ::Tokenizer ::Llama3Tokenizer ,
1212 DiscourseAi ::Tokenizer ::MixtralTokenizer ,
1313 DiscourseAi ::Tokenizer ::OpenAiTokenizer ,
14+ DiscourseAi ::Tokenizer ::QwenTokenizer ,
1415 ]
1516 end
1617
Original file line number Diff line number Diff line change @@ -4,7 +4,7 @@ module DiscourseAi
44 module Tokenizer
55 class GeminiTokenizer < BasicTokenizer
66 def self . tokenizer
7- @@tokenizer ||= Tokenizers . from_file ( "./plugins/discourse-ai/tokenizers/gemma2.json" )
7+ @@tokenizer ||= Tokenizers . from_file ( "./plugins/discourse-ai/tokenizers/gemma3.json" )
88 end
99 end
1010 end
Original file line number Diff line number Diff line change 1+ # frozen_string_literal: true
2+
3+ module DiscourseAi
4+ module Tokenizer
5+ class QwenTokenizer < BasicTokenizer
6+ def self . tokenizer
7+ @@tokenizer ||= Tokenizers . from_file ( "./plugins/discourse-ai/tokenizers/qwen3.json" )
8+ end
9+ end
10+ end
11+ end
Original file line number Diff line number Diff line change 257257 end
258258 end
259259end
260+
261+ describe DiscourseAi ::Tokenizer ::QwenTokenizer do
262+ describe "#size" do
263+ describe "returns a token count" do
264+ it "for a sentence with punctuation and capitalization and numbers" do
265+ expect ( described_class . size ( "Hello, World! 123" ) ) . to eq ( 8 )
266+ end
267+ end
268+ end
269+
270+ describe "#truncate" do
271+ it "truncates a sentence" do
272+ sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
273+ expect ( described_class . truncate ( sentence , 3 ) ) . to eq ( "foo bar baz" )
274+ end
275+
276+ it "truncates a sentence successfully at a multibyte unicode character" do
277+ sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
278+ expect ( described_class . truncate ( sentence , 8 ) ) . to eq ( "foo bar 👨🏿‍👩" )
279+ end
280+
281+ it "truncates unicode characters properly when they use more than one token per char" do
282+ sentence = "我喜欢吃比萨"
283+ original_size = described_class . size ( sentence )
284+ expect ( described_class . size ( described_class . truncate ( sentence , original_size - 2 ) ) ) . to be <
285+ original_size
286+ end
287+ end
288+ end
Original file line number Diff line number Diff line change @@ -34,6 +34,10 @@ Licensed under MIT License
3434
3535Licensed under META LLAMA 3 COMMUNITY LICENSE
3636
37- ## Gemma 2
37+ ## Gemma 3
3838
3939Licensed under the [ Gemma Terms of Use] ( https://ai.google.dev/gemma/terms )
40+
41+ ## Qwen 3
42+
43+ Licensed under the Apache 2.0 License
You can’t perform that action at this time.
0 commit comments