
Commit 67a1257

FEATURE: Gemini Tokenizer (#1088)
1 parent 5a97752 commit 67a1257

7 files changed: 839,001 additions & 3 deletions

app/models/embedding_definition.rb

Lines changed: 2 additions & 2 deletions

@@ -20,7 +20,7 @@ def tokenizer_names
       DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer,
       DiscourseAi::Tokenizer::BgeLargeEnTokenizer,
       DiscourseAi::Tokenizer::BgeM3Tokenizer,
-      DiscourseAi::Tokenizer::OpenAiTokenizer,
+      DiscourseAi::Tokenizer::GeminiTokenizer,
       DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer,
       DiscourseAi::Tokenizer::OpenAiTokenizer,
     ].map(&:name)

@@ -61,7 +61,7 @@ def presets
         pg_function: "<=>",
         url:
           "https://generativelanguage.googleapis.com/v1beta/models/embedding-001:embedContent",
-        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
+        tokenizer_class: "DiscourseAi::Tokenizer::GeminiTokenizer",
         provider: GOOGLE,
       },
       {

lib/completions/llm.rb

Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ def presets
             display_name: "Gemini 1.5 Flash",
           },
         ],
-        tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer,
+        tokenizer: DiscourseAi::Tokenizer::GeminiTokenizer,
         provider: "google",
       },
       {

lib/tokenizer/basic_tokenizer.rb

Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@ class << self
       def available_llm_tokenizers
         [
           DiscourseAi::Tokenizer::AnthropicTokenizer,
+          DiscourseAi::Tokenizer::GeminiTokenizer,
           DiscourseAi::Tokenizer::Llama3Tokenizer,
           DiscourseAi::Tokenizer::MixtralTokenizer,
           DiscourseAi::Tokenizer::OpenAiTokenizer,
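
Registering the class in this list is what exposes it alongside the other LLM tokenizers the plugin enumerates. A minimal console check, not part of this commit and shown only for illustration:

# Hypothetical sanity check: the new tokenizer is reported by the registry above.
DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers
  .include?(DiscourseAi::Tokenizer::GeminiTokenizer) # => true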
lib/tokenizer/gemini_tokenizer.rb (new file)

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Tokenizer
+    class GeminiTokenizer < BasicTokenizer
+      def self.tokenizer
+        @@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/gemma2.json")
+      end
+    end
+  end
+end
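
The new class only supplies the underlying tokenizer, loaded once from the bundled gemma2.json via Tokenizers.from_file; token counting and truncation come from the methods it inherits from BasicTokenizer, which the spec below exercises. A minimal usage sketch, not part of this commit, with return values taken from that spec:

# Usage sketch; expected values mirror the assertions in tokenizer_spec.rb below.
DiscourseAi::Tokenizer::GeminiTokenizer.size("Hello, World! 123")
# => 9
DiscourseAi::Tokenizer::GeminiTokenizer.truncate("foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud", 3)
# => "foo bar"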

spec/shared/tokenizer_spec.rb

Lines changed: 29 additions & 0 deletions

@@ -228,3 +228,32 @@
     end
   end
 end
+
+describe DiscourseAi::Tokenizer::GeminiTokenizer do
+  describe "#size" do
+    describe "returns a token count" do
+      it "for a sentence with punctuation and capitalization and numbers" do
+        expect(described_class.size("Hello, World! 123")).to eq(9)
+      end
+    end
+  end
+
+  describe "#truncate" do
+    it "truncates a sentence" do
+      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
+      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
+    end
+
+    it "truncates a sentence successfully at a multibyte unicode character" do
+      sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
+      expect(described_class.truncate(sentence, 8)).to eq("foo bar 👨🏿‍👩")
+    end
+
+    it "truncates unicode characters properly when they use more than one token per char" do
+      sentence = "我喜欢吃比萨"
+      original_size = described_class.size(sentence)
+      expect(described_class.size(described_class.truncate(sentence, original_size - 2))).to be <
+        original_size
+    end
+  end
+end

tokenizers/README.md

Lines changed: 4 additions & 0 deletions

@@ -33,3 +33,7 @@ Licensed under MIT License
 ## Meta-Llama-3-70B-Instruct
 
 Licensed under META LLAMA 3 COMMUNITY LICENSE
+
+## Gemma 2
+
+Licensed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms)
