1 change: 0 additions & 1 deletion .gitignore
@@ -6,5 +6,4 @@ evals/log
evals/cases
config/eval-llms.local.yml
# this gets rid of search results from ag, ripgrep, etc
tokenizers/
public/ai-share/highlight.min.js
2 changes: 1 addition & 1 deletion app/models/embedding_definition.rb
@@ -23,7 +23,7 @@ def tokenizer_names
DiscourseAi::Tokenizer::GeminiTokenizer,
DiscourseAi::Tokenizer::MultilingualE5LargeTokenizer,
DiscourseAi::Tokenizer::OpenAiTokenizer,
DiscourseAi::Tokenizer::MixtralTokenizer,
DiscourseAi::Tokenizer::MistralTokenizer,
DiscourseAi::Tokenizer::QwenTokenizer,
].map(&:name)
end
@@ -0,0 +1,43 @@
# frozen_string_literal: true

class RenameMixtralTokenizerToMistralTokenizer < ActiveRecord::Migration[7.2]
  def up
    execute <<~SQL
      UPDATE
        llm_models
      SET
        tokenizer = 'DiscourseAi::Tokenizer::Mistral'
      WHERE
        tokenizer = 'DiscourseAi::Tokenizer::Mixtral'
    SQL

    execute <<~SQL
      UPDATE
        embedding_definitions
      SET
        tokenizer_class = 'DiscourseAi::Tokenizer::Mistral'
      WHERE
        tokenizer_class = 'DiscourseAi::Tokenizer::Mixtral'
    SQL
  end

  def down
    execute <<~SQL
      UPDATE
        llm_models
      SET
        tokenizer = 'DiscourseAi::Tokenizer::Mixtral'
      WHERE
        tokenizer = 'DiscourseAi::Tokenizer::Mistral'
    SQL

    execute <<~SQL
      UPDATE
        embedding_definitions
      SET
        tokenizer_class = 'DiscourseAi::Tokenizer::Mixtral'
      WHERE
        tokenizer_class = 'DiscourseAi::Tokenizer::Mistral'
    SQL
  end
end
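
Once the migration has run, no stored rows should still reference the old class name. A minimal console check, assuming LlmModel and EmbeddingDefinition are the ActiveRecord models backing the two tables updated above (EmbeddingDefinition appears elsewhere in this diff; LlmModel is assumed):

# Hypothetical Rails console check; not part of this change.
LlmModel.where("tokenizer LIKE ?", "%Mixtral%").count                   # expected: 0
EmbeddingDefinition.where("tokenizer_class LIKE ?", "%Mixtral%").count  # expected: 0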
7 changes: 6 additions & 1 deletion lib/automation/llm_triage.rb
@@ -40,7 +40,12 @@ def self.handle(

content = "title: #{post.topic.title}\n#{post.raw}"

content = llm.tokenizer.truncate(content, max_post_tokens) if max_post_tokens.present?
content =
llm.tokenizer.truncate(
content,
max_post_tokens,
strict: SiteSetting.ai_strict_token_counting,
) if max_post_tokens.present?

if post.upload_ids.present?
content = [content]
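The new strict: keyword is threaded through every truncate call in this change and is driven by a single site setting, ai_strict_token_counting. The tokenizer side is not shown in this diff, so the exact semantics are an assumption; the reading suggested by the setting name is sketched below.

# Minimal sketch, assuming strict: true trades speed for an exact token count
# while the default keeps the previous, faster approximate behaviour.
tokenizer = DiscourseAi::Tokenizer::OpenAiTokenizer   # any tokenizer class from this plugin
content = "title: #{post.topic.title}\n#{post.raw}"
content = tokenizer.truncate(content, max_post_tokens, strict: SiteSetting.ai_strict_token_counting)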
7 changes: 6 additions & 1 deletion lib/automation/report_context_generator.rb
@@ -99,7 +99,12 @@ def format_post(post)
buffer << post.created_at.strftime("%Y-%m-%d %H:%M")
buffer << "user: #{post.user&.username}"
buffer << "likes: #{post.like_count}"
excerpt = @tokenizer.truncate(post.raw, @tokens_per_post)
excerpt =
@tokenizer.truncate(
post.raw,
@tokens_per_post,
strict: SiteSetting.ai_strict_token_counting,
)
excerpt = "excerpt: #{excerpt}..." if excerpt.length < post.raw.length
buffer << "#{excerpt}"
{ likes: post.like_count, info: buffer.join("\n") }
1 change: 1 addition & 0 deletions lib/completions/dialects/dialect.rb
@@ -147,6 +147,7 @@ def trim_messages(messages)
system_message[:content] = tokenizer.truncate(
system_message[:content],
max_system_tokens,
strict: SiteSetting.ai_strict_token_counting,
)
end

2 changes: 1 addition & 1 deletion lib/completions/llm.rb
@@ -190,7 +190,7 @@ def presets
display_name: "Pixtral Large",
},
],
tokenizer: DiscourseAi::Tokenizer::MixtralTokenizer,
tokenizer: DiscourseAi::Tokenizer::MistralTokenizer,
endpoint: "https://api.mistral.ai/v1/chat/completions",
provider: "mistral",
},
12 changes: 8 additions & 4 deletions lib/embeddings/strategies/truncation.rb
@@ -22,7 +22,11 @@ def prepare_target_text(target, vdef)
when Post
post_truncation(target, vdef.tokenizer, max_length)
when RagDocumentFragment
vdef.tokenizer.truncate(target.fragment, max_length)
vdef.tokenizer.truncate(
target.fragment,
max_length,
strict: SiteSetting.ai_strict_token_counting,
)
else
raise ArgumentError, "Invalid target type"
end
@@ -36,7 +40,7 @@ def prepare_query_text(text, vdef, asymetric: false)
qtext = asymetric ? "#{vdef.search_prompt} #{text}" : text
max_length = vdef.max_sequence_length - 2

vdef.tokenizer.truncate(qtext, max_length)
vdef.tokenizer.truncate(qtext, max_length, strict: SiteSetting.ai_strict_token_counting)
end

private
@@ -74,7 +78,7 @@ def topic_truncation(topic, tokenizer, max_length)
text << "\n\n"
end

tokenizer.truncate(text, max_length)
tokenizer.truncate(text, max_length, strict: SiteSetting.ai_strict_token_counting)
end

def post_truncation(post, tokenizer, max_length)
@@ -86,7 +90,7 @@ def post_truncation(post, tokenizer, max_length)
text << Nokogiri::HTML5.fragment(post.cooked).text
end

tokenizer.truncate(text, max_length)
tokenizer.truncate(text, max_length, strict: SiteSetting.ai_strict_token_counting)
end
end
end
7 changes: 6 additions & 1 deletion lib/personas/question_consolidator.rb
@@ -42,7 +42,12 @@ def revised_prompt
truncated_content = content

if current_tokens > allowed_tokens
truncated_content = @llm.tokenizer.truncate(content, allowed_tokens)
truncated_content =
@llm.tokenizer.truncate(
content,
allowed_tokens,
strict: SiteSetting.ai_strict_token_counting,
)
current_tokens = allowed_tokens
end

4 changes: 3 additions & 1 deletion lib/personas/tool_runner.rb
@@ -278,7 +278,9 @@ def rag_search(query, filenames: nil, limit: 10)
def attach_truncate(mini_racer_context)
mini_racer_context.attach(
"_llm_truncate",
->(text, length) { @llm.tokenizer.truncate(text, length) },
->(text, length) do
@llm.tokenizer.truncate(text, length, strict: SiteSetting.ai_strict_token_counting)
end,
)

mini_racer_context.attach(
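The underscore prefix on _llm_truncate suggests an internal binding that a JS-side wrapper exposes to tool scripts; only the Ruby side changes here. A hedged sketch of exercising the binding directly, assuming ctx is the MiniRacer::Context that attach_truncate was called with:

# Hypothetical check from Ruby; in practice ToolRunner drives this context itself.
ctx.eval("_llm_truncate('a very long tool result ...', 5)")
# returns the text truncated with the same strict setting as the Ruby callers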
5 changes: 4 additions & 1 deletion lib/personas/tools/google.rb
@@ -70,7 +70,10 @@ def minimize_field(result, field, llm, max_tokens: 100)
data = result[field]
return "" if data.blank?

llm.tokenizer.truncate(data, max_tokens).squish
llm
.tokenizer
.truncate(data, max_tokens, strict: SiteSetting.ai_strict_token_counting)
.squish
end

def parse_search_json(json_data, escaped_query, llm)
7 changes: 6 additions & 1 deletion lib/personas/tools/setting_context.rb
@@ -99,7 +99,12 @@ def invoke

result.gsub!(/^#{Regexp.escape(Rails.root.to_s)}/, "")

result = llm.tokenizer.truncate(result, MAX_CONTEXT_TOKENS)
result =
llm.tokenizer.truncate(
result,
MAX_CONTEXT_TOKENS,
strict: SiteSetting.ai_strict_token_counting,
)

{ setting_name: setting_name, context: result }
end
2 changes: 1 addition & 1 deletion lib/personas/tools/tool.rb
@@ -255,7 +255,7 @@ def truncate(text, llm:, percent_length: nil, max_length: nil)
target = max_length if target > max_length
end

llm.tokenizer.truncate(text, target)
llm.tokenizer.truncate(text, target, strict: SiteSetting.ai_strict_token_counting)
end

def accepted_options
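Tools typically call this helper with a fraction of the prompt budget rather than a hard token count; a hypothetical call using the option names from the signature above, assuming percent_length is a fraction of the model's prompt budget:

# Hypothetical use inside a tool: keep roughly 25% of the available prompt tokens,
# capped at 5,000, honouring the strict-counting site setting via the helper above.
summary = truncate(raw_page_text, llm: llm, percent_length: 0.25, max_length: 5_000)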
6 changes: 5 additions & 1 deletion lib/sentiment/post_classification.rb
@@ -161,7 +161,11 @@ def prepare_text(target)
target.raw
end

Tokenizer::BertTokenizer.truncate(content, 512)
DiscourseAi::Tokenizer::BertTokenizer.truncate(
content,
512,
strict: SiteSetting.ai_strict_token_counting,
)
end

def request_with(client, content)
18 changes: 15 additions & 3 deletions lib/summarization/fold_content.rb
@@ -92,7 +92,11 @@ def fold(items, user, &on_partial_blk)
items.each_with_index do |item, idx|
as_text = "(#{item[:id]} #{item[:poster]} said: #{item[:text]} "

if tokenizer.below_limit?(as_text, tokens_left)
if tokenizer.below_limit?(
as_text,
tokens_left,
strict: SiteSetting.ai_strict_token_counting,
)
content_in_window << item
tokens_left -= tokenizer.size(as_text)
else
@@ -151,8 +155,16 @@ def truncate(item)
tokenizer = llm_model.tokenizer_class

item[:text] = [
tokenizer.truncate(split_1, truncation_length),
tokenizer.truncate(split_2.reverse, truncation_length).reverse,
tokenizer.truncate(
split_1,
truncation_length,
strict: SiteSetting.ai_strict_token_counting,
),
tokenizer.truncate(
split_2.reverse,
truncation_length,
strict: SiteSetting.ai_strict_token_counting,
).reverse,
].join(" ")

item
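truncate here keeps both ends of an oversized item: the first half is cut normally, and the second half is reversed before truncation so that a head-truncating tokenizer effectively keeps its tail (an approximation, since a reversed string tokenizes differently). A toy illustration of the intent, with made-up lengths:

# Toy illustration only; real budgets come from the model's context window.
tokenizer = DiscourseAi::Tokenizer::OpenAiTokenizer
split_1 = "the opening of a very long post ..."
split_2 = "... and the way that post eventually ends"
head = tokenizer.truncate(split_1, 20, strict: SiteSetting.ai_strict_token_counting)
tail = tokenizer.truncate(split_2.reverse, 20, strict: SiteSetting.ai_strict_token_counting).reverse
item_text = [head, tail].join(" ")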
12 changes: 0 additions & 12 deletions lib/tokenizer/all_mpnet_base_v2_tokenizer.rb

This file was deleted.

12 changes: 0 additions & 12 deletions lib/tokenizer/anthropic_tokenizer.rb

This file was deleted.

55 changes: 0 additions & 55 deletions lib/tokenizer/basic_tokenizer.rb

This file was deleted.

12 changes: 0 additions & 12 deletions lib/tokenizer/bert_tokenizer.rb

This file was deleted.

11 changes: 0 additions & 11 deletions lib/tokenizer/bge_large_en_tokenizer.rb

This file was deleted.

11 changes: 0 additions & 11 deletions lib/tokenizer/bge_m3_tokenizer.rb

This file was deleted.

11 changes: 0 additions & 11 deletions lib/tokenizer/gemini_tokenizer.rb

This file was deleted.

12 changes: 0 additions & 12 deletions lib/tokenizer/llama3_tokenizer.rb

This file was deleted.
