diff --git a/lib/tokenizer/open_ai_gpt4o_tokenizer.rb b/lib/tokenizer/open_ai_gpt4o_tokenizer.rb deleted file mode 100644 index bf7a28bed..000000000 --- a/lib/tokenizer/open_ai_gpt4o_tokenizer.rb +++ /dev/null @@ -1,13 +0,0 @@ -# frozen_string_literal: true - -module DiscourseAi - module Tokenizer - class OpenAiGpt4oTokenizer < OpenAiTokenizer - class << self - def tokenizer - @@tokenizer ||= Tiktoken.get_encoding("o200k_base") - end - end - end - end -end diff --git a/lib/tokenizer/open_ai_tokenizer.rb b/lib/tokenizer/open_ai_tokenizer.rb index 0fe06225a..0eab6605b 100644 --- a/lib/tokenizer/open_ai_tokenizer.rb +++ b/lib/tokenizer/open_ai_tokenizer.rb @@ -5,7 +5,7 @@ module Tokenizer class OpenAiTokenizer < BasicTokenizer class << self def tokenizer - @@tokenizer ||= Tiktoken.get_encoding("cl100k_base") + @@tokenizer ||= Tiktoken.get_encoding("o200k_base") end def tokenize(text) diff --git a/spec/lib/completions/dialects/dialect_spec.rb b/spec/lib/completions/dialects/dialect_spec.rb index f210e0c4e..4a3a766d0 100644 --- a/spec/lib/completions/dialects/dialect_spec.rb +++ b/spec/lib/completions/dialects/dialect_spec.rb @@ -99,7 +99,8 @@ def tokenizer end it "limits the system message to 60% of available tokens" do - prompt = DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens") + prompt = + DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens okay") prompt.push(type: :user, content: five_token_msg) dialect = TestDialect.new(prompt, llm_model) @@ -109,7 +110,7 @@ def tokenizer expect(trimmed).to eq( [ - { type: :system, content: "I'm a system message consisting of 10" }, + { type: :system, content: "I'm a system message consisting of 10 tokens" }, { type: :user, content: five_token_msg }, ], ) diff --git a/spec/lib/completions/endpoints/open_ai_spec.rb b/spec/lib/completions/endpoints/open_ai_spec.rb index 1480324a6..c5b5a4561 100644 --- a/spec/lib/completions/endpoints/open_ai_spec.rb +++ b/spec/lib/completions/endpoints/open_ai_spec.rb @@ -18,7 +18,7 @@ def response(content, tool_call: false) model: "gpt-3.5-turbo-0301", usage: { prompt_tokens: 8, - completion_tokens: 13, + completion_tokens: 12, total_tokens: 499, }, choices: [ diff --git a/spec/shared/tokenizer_spec.rb b/spec/shared/tokenizer_spec.rb index 42bf24211..46ba71ee3 100644 --- a/spec/shared/tokenizer_spec.rb +++ b/spec/shared/tokenizer_spec.rb @@ -79,7 +79,7 @@ it "truncates a sentence successfully at a multibyte unicode character" do sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud" - expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿") + expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿‍") end it "truncates unicode characters properly when they use more than one token per char" do @@ -104,17 +104,7 @@ end it "handles unicode characters properly when they use more than one token per char" do - expect(described_class.below_limit?("我喜欢吃比萨萨", 10)).to eq(false) - end - end -end - -describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do - describe "#size" do - describe "returns a token count" do - it "for a sentence with punctuation and capitalization and numbers" do - expect(described_class.size("Hello, World! 123")).to eq(6) - end + expect(described_class.below_limit?("我喜欢吃比萨萨", 6)).to eq(false) end end end