7 files changed: +839,001 / -3 lines. (The bulk of the additions is presumably the bundled tokenizers/gemma2.json vocabulary file referenced below, which is too large to render inline.)

@@ -20,7 +20,7 @@ def tokenizer_names
       DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer,
       DiscourseAi::Tokenizer::BgeLargeEnTokenizer,
       DiscourseAi::Tokenizer::BgeM3Tokenizer,
-      DiscourseAi::Tokenizer::OpenAiTokenizer,
+      DiscourseAi::Tokenizer::GeminiTokenizer,
       DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer,
       DiscourseAi::Tokenizer::OpenAiTokenizer,
     ].map(&:name)

@@ -61,7 +61,7 @@ def presets
         pg_function: "<=>",
         url:
           "https://generativelanguage.googleapis.com/v1beta/models/embedding-001:embedContent",
-        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
+        tokenizer_class: "DiscourseAi::Tokenizer::GeminiTokenizer",
         provider: GOOGLE,
       },
       {

@@ -56,7 +56,7 @@ def presets
             display_name: "Gemini 1.5 Flash",
           },
         ],
-        tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer,
+        tokenizer: DiscourseAi::Tokenizer::GeminiTokenizer,
         provider: "google",
       },
       {

@@ -7,6 +7,7 @@ class << self
       def available_llm_tokenizers
         [
           DiscourseAi::Tokenizer::AnthropicTokenizer,
+          DiscourseAi::Tokenizer::GeminiTokenizer,
           DiscourseAi::Tokenizer::Llama3Tokenizer,
           DiscourseAi::Tokenizer::MixtralTokenizer,
           DiscourseAi::Tokenizer::OpenAiTokenizer,

@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Tokenizer
+    class GeminiTokenizer < BasicTokenizer
+      def self.tokenizer
+        @@tokenizer ||= Tokenizers.from_file("./plugins/discourse-ai/tokenizers/gemma2.json")
+      end
+    end
+  end
+end
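
The new class only supplies the Gemma 2 vocabulary; the `size` and `truncate` behaviour exercised by the spec below is inherited from `BasicTokenizer`, which is not part of this diff. Below is a minimal sketch of what that inherited interface plausibly looks like, assuming it wraps the public API of the `tokenizers` gem (`Tokenizers.from_file`, `encode`, `decode`); it is not the plugin's actual code.

```ruby
# frozen_string_literal: true

# Sketch only: the real BasicTokenizer lives elsewhere in the plugin.
require "tokenizers"

module DiscourseAi
  module Tokenizer
    class BasicTokenizer
      def self.tokenizer
        # Subclasses memoize a Tokenizers instance loaded from a JSON vocabulary.
        raise NotImplementedError
      end

      # Number of tokens the vocabulary produces for the given text.
      def self.size(text)
        tokenizer.encode(text).tokens.size
      end

      # Keep at most max_length tokens, then decode the id prefix back to text.
      # Decoding ids (rather than slicing characters) is what lets truncation
      # cut safely through multibyte graphemes, as the specs below verify.
      def self.truncate(text, max_length)
        ids = tokenizer.encode(text).ids
        tokenizer.decode(ids.take(max_length))
      end
    end
  end
end
```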

@@ -228,3 +228,32 @@
     end
   end
 end
+
+describe DiscourseAi::Tokenizer::GeminiTokenizer do
+  describe "#size" do
+    describe "returns a token count" do
+      it "for a sentence with punctuation and capitalization and numbers" do
+        expect(described_class.size("Hello, World! 123")).to eq(9)
+      end
+    end
+  end
+
+  describe "#truncate" do
+    it "truncates a sentence" do
+      sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
+      expect(described_class.truncate(sentence, 3)).to eq("foo bar")
+    end
+
+    it "truncates a sentence successfully at a multibyte unicode character" do
+      sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
+      expect(described_class.truncate(sentence, 8)).to eq("foo bar 👨🏿‍👩")
+    end
+
+    it "truncates unicode characters properly when they use more than one token per char" do
+      sentence = "我喜欢吃比萨"
+      original_size = described_class.size(sentence)
+      expect(described_class.size(described_class.truncate(sentence, original_size - 2))).to be <
+        original_size
+    end
+  end
+end
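
For a quick sanity check, a hypothetical console session against the new class (return values mirror the spec above, and the bundled gemma2.json vocabulary is assumed to be present at the configured path):

```ruby
DiscourseAi::Tokenizer::GeminiTokenizer.size("Hello, World! 123")
# => 9

sentence = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
DiscourseAi::Tokenizer::GeminiTokenizer.truncate(sentence, 3)
# => "foo bar"
```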

@@ -33,3 +33,7 @@ Licensed under MIT License
 ## Meta-Llama-3-70B-Instruct
 
 Licensed under META LLAMA 3 COMMUNITY LICENSE
+
+## Gemma 2
+
+Licensed under the [Gemma Terms of Use](https://ai.google.dev/gemma/terms)