
Commit a40e2d3

FEATURE: Update OpenAI tokenizer to GPT-4o and later (#1467)

1 parent: 2fe99a0

File tree: 5 files changed (+7 additions, −29 deletions)

lib/tokenizer/open_ai_gpt4o_tokenizer.rb

Lines changed: 0 additions & 13 deletions
This file was deleted.
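
With OpenAiTokenizer itself switching to o200k_base (next file), a dedicated GPT-4o tokenizer class is presumably redundant, which would explain why this class and its spec (last file) are removed.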

lib/tokenizer/open_ai_tokenizer.rb

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ module Tokenizer
   class OpenAiTokenizer < BasicTokenizer
     class << self
       def tokenizer
-        @@tokenizer ||= Tiktoken.get_encoding("cl100k_base")
+        @@tokenizer ||= Tiktoken.get_encoding("o200k_base")
       end
 
       def tokenize(text)
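
This one-line swap is the heart of the commit: the shared OpenAI tokenizer moves from cl100k_base (the GPT-3.5/GPT-4 encoding) to o200k_base, the encoding used by GPT-4o and later models. A minimal sketch of the practical difference, assuming the tiktoken_ruby gem's get_encoding/encode API as used above; the sample string is illustrative and the counts are assumptions to verify locally, not values from the commit:

require "tiktoken_ruby"

old_enc = Tiktoken.get_encoding("cl100k_base") # GPT-3.5 / GPT-4
new_enc = Tiktoken.get_encoding("o200k_base")  # GPT-4o and later

text = "I'm a system message consisting of 10 tokens"

# o200k_base has roughly twice the vocabulary of cl100k_base, so it often
# packs the same text into fewer tokens. That shift is why the spec
# fixtures in the remaining files had to be re-tuned to new token counts.
puts old_enc.encode(text).length
puts new_enc.encode(text).length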

spec/lib/completions/dialects/dialect_spec.rb

Lines changed: 3 additions & 2 deletions
@@ -99,7 +99,8 @@ def tokenizer
   end
 
   it "limits the system message to 60% of available tokens" do
-    prompt = DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens")
+    prompt =
+      DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens okay")
     prompt.push(type: :user, content: five_token_msg)
 
     dialect = TestDialect.new(prompt, llm_model)
@@ -109,7 +110,7 @@ def tokenizer
 
     expect(trimmed).to eq(
       [
-        { type: :system, content: "I'm a system message consisting of 10" },
+        { type: :system, content: "I'm a system message consisting of 10 tokens" },
         { type: :user, content: five_token_msg },
       ],
     )
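
Why the fixture gained a trailing "okay": the spec needs a system message of exactly 10 tokens for its 60% budget math, and under o200k_base the original string no longer tokenizes to that count, so the trimmed result now cuts at "...10 tokens" instead of "...10". A hedged way to sanity-check such a fixture, reusing the tiktoken_ruby API from the tokenizer above; whether this prints 10 is an assumption to confirm locally:

require "tiktoken_ruby"

enc = Tiktoken.get_encoding("o200k_base")
fixture = "I'm a system message consisting of 10 tokens okay"

# The spec assumes this fixture is exactly 10 tokens under o200k_base;
# printing the length confirms it before relying on the 60% trimming math.
puts enc.encode(fixture).length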

spec/lib/completions/endpoints/open_ai_spec.rb

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ def response(content, tool_call: false)
     model: "gpt-3.5-turbo-0301",
     usage: {
       prompt_tokens: 8,
-      completion_tokens: 13,
+      completion_tokens: 12,
       total_tokens: 499,
     },
     choices: [
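
The stub's completion_tokens presumably tracks the tokenized length of the canned response content, which comes out one token shorter under o200k_base; that reading is inferred from the diff rather than stated in the commit.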

spec/shared/tokenizer_spec.rb

Lines changed: 2 additions & 12 deletions
@@ -79,7 +79,7 @@
 
     it "truncates a sentence successfully at a multibyte unicode character" do
       sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
-      expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
+      expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
     end
 
     it "truncates unicode characters properly when they use more than one token per char" do
@@ -104,17 +104,7 @@
     end
 
     it "handles unicode characters properly when they use more than one token per char" do
-      expect(described_class.below_limit?("我喜欢吃比萨萨", 10)).to eq(false)
-    end
-  end
-end
-
-describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do
-  describe "#size" do
-    describe "returns a token count" do
-      it "for a sentence with punctuation and capitalization and numbers" do
-        expect(described_class.size("Hello, World! 123")).to eq(6)
-      end
+      expect(described_class.below_limit?("我喜欢吃比萨萨", 6)).to eq(false)
     end
   end
 end
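
Two things worth noting in this spec. First, the truncate expectation in the first hunk looks identical on both sides of the diff; the change is most likely in invisible code points (for example, a trailing zero-width joiner kept or dropped after truncation), which this page cannot render distinctly. Second, the below_limit? threshold for the CJK string drops from 10 to 6 because o200k_base spends fewer tokens on non-Latin text, and the OpenAiGpt4oTokenizer describe block disappears along with the deleted class it tested. A small sketch of the CJK effect, again assuming the tiktoken_ruby API; the exact counts are assumptions to verify locally:

require "tiktoken_ruby"

old_enc = Tiktoken.get_encoding("cl100k_base")
new_enc = Tiktoken.get_encoding("o200k_base")

# o200k_base's larger multilingual vocabulary typically encodes CJK text
# in noticeably fewer tokens, which is why the spec's limit tightened.
puts old_enc.encode("我喜欢吃比萨萨").length
puts new_enc.encode("我喜欢吃比萨萨").length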
