Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions lib/tokenizer/open_ai_gpt4o_tokenizer.rb

This file was deleted.

2 changes: 1 addition & 1 deletion lib/tokenizer/open_ai_tokenizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ module Tokenizer
class OpenAiTokenizer < BasicTokenizer
class << self
def tokenizer
@@tokenizer ||= Tiktoken.get_encoding("cl100k_base")
@@tokenizer ||= Tiktoken.get_encoding("o200k_base")
end

def tokenize(text)
Expand Down
5 changes: 3 additions & 2 deletions spec/lib/completions/dialects/dialect_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ def tokenizer
end

it "limits the system message to 60% of available tokens" do
prompt = DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens")
prompt =
DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens okay")
prompt.push(type: :user, content: five_token_msg)

dialect = TestDialect.new(prompt, llm_model)
Expand All @@ -109,7 +110,7 @@ def tokenizer

expect(trimmed).to eq(
[
{ type: :system, content: "I'm a system message consisting of 10" },
{ type: :system, content: "I'm a system message consisting of 10 tokens" },
{ type: :user, content: five_token_msg },
],
)
Expand Down
2 changes: 1 addition & 1 deletion spec/lib/completions/endpoints/open_ai_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def response(content, tool_call: false)
model: "gpt-3.5-turbo-0301",
usage: {
prompt_tokens: 8,
completion_tokens: 13,
completion_tokens: 12,
total_tokens: 499,
},
choices: [
Expand Down
14 changes: 2 additions & 12 deletions spec/shared/tokenizer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@

it "truncates a sentence successfully at a multibyte unicode character" do
sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
end

it "truncates unicode characters properly when they use more than one token per char" do
Expand All @@ -104,17 +104,7 @@
end

it "handles unicode characters properly when they use more than one token per char" do
expect(described_class.below_limit?("我喜欢吃比萨萨", 10)).to eq(false)
end
end
end

describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do
describe "#size" do
describe "returns a token count" do
it "for a sentence with punctuation and capitalization and numbers" do
expect(described_class.size("Hello, World! 123")).to eq(6)
end
expect(described_class.below_limit?("我喜欢吃比萨萨", 6)).to eq(false)
end
end
end
Expand Down
Loading