1 change: 1 addition & 0 deletions app/models/embedding_definition.rb
@@ -24,6 +24,7 @@ def tokenizer_names
       DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer,
       DiscourseAi::Tokenizer::OpenAiTokenizer,
       DiscourseAi::Tokenizer::MixtralTokenizer,
+      DiscourseAi::Tokenizer::QwenTokenizer,
     ].map(&:name)
   end

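For context, a hedged sketch of how this registry is typically consumed (the validation and helper below are assumptions based on the method name, not part of this diff): the mapped class-name strings can back an inclusion validation, with the stored string resolved back to a tokenizer class when needed.

```ruby
# Hypothetical consumer of tokenizer_names (not from this PR).
# Assumes the model stores a tokenizer class name as a string.
validates :tokenizer_class, inclusion: { in: tokenizer_names }

def tokenizer
  tokenizer_class.constantize # Rails-style lookup of the stored class name
end
```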
1 change: 1 addition & 0 deletions lib/tokenizer/basic_tokenizer.rb
@@ -11,6 +11,7 @@ def available_llm_tokenizers
       DiscourseAi::Tokenizer::Llama3Tokenizer,
       DiscourseAi::Tokenizer::MixtralTokenizer,
       DiscourseAi::Tokenizer::OpenAiTokenizer,
+      DiscourseAi::Tokenizer::QwenTokenizer,
     ]
   end

2 changes: 1 addition & 1 deletion lib/tokenizer/gemini_tokenizer.rb
@@ -4,7 +4,7 @@ module DiscourseAi
   module Tokenizer
     class GeminiTokenizer < BasicTokenizer
       def self.tokenizer
-        @@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/gemma2.json")
+        @@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/gemma3.json")
       end
     end
   end
11 changes: 11 additions & 0 deletions lib/tokenizer/qwen_tokenizer.rb
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Tokenizer
+    class QwenTokenizer < BasicTokenizer
+      def self.tokenizer
+        @@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/qwen3.json")
+      end
+    end
+  end
+end
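For orientation, a minimal usage sketch of the new class (illustration, not diff content): `size` and `truncate` are inherited from `BasicTokenizer`, and the expected values mirror the specs added below. The `@@tokenizer ||=` memoization means the qwen3.json vocabulary is parsed once per process and then reused.

```ruby
# Sketch: token counting and truncation with the new Qwen tokenizer.
# Expected values are taken from the specs added in this PR.
tokenizer = DiscourseAi::Tokenizer::QwenTokenizer

tokenizer.size("Hello, World! 123")                 # => 8
tokenizer.truncate("foo bar baz qux quux corge", 3) # => "foo bar baz"
```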
29 changes: 29 additions & 0 deletions spec/shared/tokenizer_spec.rb
@@ -257,3 +257,32 @@
     end
   end
 end
+
+describe DiscourseAi::Tokenizer::QwenTokenizer do
+  describe "#size" do
+    describe "returns a token count" do
+      it "for a sentence with punctuation and capitalization and numbers" do
+        expect(described_class.size("Hello, World! 123")).to eq(8)
+      end
+    end
+  end
+
+  describe "#truncate" do
+    it "truncates a sentence" do
+      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
+      expect(described_class.truncate(sentence, 3)).to eq("foo bar baz")
+    end
+
+    it "truncates a sentence successfully at a multibyte unicode character" do
+      sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
+      expect(described_class.truncate(sentence, 8)).to eq("foo bar 👨🏿‍👩")
+    end
+
+    it "truncates unicode characters properly when they use more than one token per char" do
+      sentence = "我喜欢吃比萨"
+      original_size = described_class.size(sentence)
+      expect(described_class.size(described_class.truncate(sentence, original_size - 2))).to be <
+        original_size
+    end
+  end
+end
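The multibyte specs above exist because token-based truncation can split a grapheme cluster that spans several tokens. A minimal sketch of the underlying approach (an assumption about how `BasicTokenizer.truncate` works, not code from this PR):

```ruby
# Hypothetical truncate: encode, keep the first max_tokens ids, decode.
# Decoding a sliced id sequence can cut through a multi-token emoji or
# CJK character, which is exactly what the specs above exercise.
def self.truncate(text, max_tokens)
  ids = tokenizer.encode(text).ids
  tokenizer.decode(ids.take(max_tokens))
end
```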
6 changes: 5 additions & 1 deletion tokenizers/README.md
@@ -34,6 +34,10 @@ Licensed under MIT License

 Licensed under META LLAMA 3 COMMUNITY LICENSE
 
-## Gemma 2
+## Gemma 3
 
 Licensed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms)
+
+## Qwen 3
+
+Licensed under the Apache 2.0 License