Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions lib/tokenizer/open_ai_gpt4o_tokenizer.rb

This file was deleted.

2 changes: 1 addition & 1 deletion lib/tokenizer/open_ai_tokenizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ module Tokenizer
class OpenAiTokenizer < BasicTokenizer
class << self
def tokenizer
@@tokenizer ||= Tiktoken.get_encoding("cl100k_base")
@@tokenizer ||= Tiktoken.get_encoding("o200k_base")
end

def tokenize(text)
Expand Down
14 changes: 2 additions & 12 deletions spec/shared/tokenizer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@

it "truncates a sentence successfully at a multibyte unicode character" do
sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
end

it "truncates unicode characters properly when they use more than one token per char" do
Expand All @@ -104,17 +104,7 @@
end

it "handles unicode characters properly when they use more than one token per char" do
expect(described_class.below_limit?("我喜欢吃比萨萨", 10)).to eq(false)
end
end
end

describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do
describe "#size" do
describe "returns a token count" do
it "for a sentence with punctuation and capitalization and numbers" do
expect(described_class.size("Hello, World! 123")).to eq(6)
end
expect(described_class.below_limit?("我喜欢吃比萨萨", 6)).to eq(false)
end
end
end
Expand Down
Loading