Skip to content

Commit f26e177

Browse files
Prefer some vector models to others (#2263)
Some vector models are better than others! 😅 We currently have a heuristic that any vector model is better than Elasticsearch. The intention of this PR is to extend that to also prefer some vector models to others. In particular, OpenAI > Paraphrase > (default) > Elasticsearch. References: CV2-6265. --------- Co-authored-by: computermacgyver <computermacgyver> Co-authored-by: Caio <117518+caiosba@users.noreply.github.com>
1 parent 770796b commit f26e177

File tree

4 files changed

+199
-6
lines changed

4 files changed

+199
-6
lines changed

app/models/bot/alegre.rb

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,16 @@ class Error < ::StandardError
1717
OPENAI_ADA_MODEL = 'openai-text-embedding-ada-002'
1818
PARAPHRASE_MULTILINGUAL_MODEL = 'paraphrase-multilingual-mpnet-base-v2'
1919
ELASTICSEARCH_MODEL = 'elasticsearch'
20+
21+
TEXT_MODEL_RANKS = { # Higher is better
22+
Bot::Alegre::OPENAI_ADA_MODEL => 3,
23+
Bot::Alegre::PARAPHRASE_MULTILINGUAL_MODEL => 2,
24+
Bot::Alegre::FILIPINO_MODEL => 2,
25+
Bot::Alegre::MEAN_TOKENS_MODEL => 1,
26+
Bot::Alegre::INDIAN_MODEL => 1,
27+
Bot::Alegre::ELASTICSEARCH_MODEL => 0
28+
}
29+
2030
DEFAULT_ES_SCORE = 10
2131

2232
REPORT_TEXT_SIMILARITY_FIELDS = ['report_text_title', 'report_text_content', 'report_visual_card_title', 'report_visual_card_content']
@@ -173,7 +183,6 @@ def self.run(body)
173183
handled
174184
end
175185

176-
177186
def self.get_number_of_words(text)
178187
# Get the number of space-separated words (Does not work with Chinese/Japanese)
179188
space_separted_words = text.to_s.gsub(/[^\p{L}\s]/u, '').strip.chomp.split(/\s+/).size
@@ -504,11 +513,34 @@ def self.build_context(team_id, fields = nil)
504513
end
505514

506515
def self.return_prioritized_matches(pm_id_scores)
516+
# Examples for "pm_id_scores":
517+
# pm_id_scores = [ # Array
518+
# { score: 0.75, context: { 'team_id' => 1, 'project_media_id' => 2, 'has_custom_id' => true, 'field' => 'original_title', 'temporary_media' => false }, model: Bot::Alegre::OPENAI_ADA_MODEL },
519+
# { score: 0.85, context: { 'team_id' => 1, 'project_media_id' => 3, 'has_custom_id' => true, 'field' => 'original_title', 'temporary_media' => false }, model: Bot::Alegre::MEAN_TOKENS_MODEL }
520+
# ]
521+
# pm_id_scores = { # Hash
522+
# 2 => {
523+
# score: 0.75,
524+
# context: { 'has_custom_id' => true, 'field' => 'original_description', 'project_media_id' => 2, 'temporary_media' => false, 'team_id' => 1 },
525+
# model: Bot::Alegre::OPENAI_ADA_MODEL,
526+
# source_field: 'original_description',
527+
# target_field: 'original_description',
528+
# relationship_type: { source: 'confirmed_sibling', target: 'confirmed_sibling' }
529+
# },
530+
# 3 => {
531+
# score: 0.85,
532+
# context: { 'has_custom_id' => true, 'field' => 'original_description', 'project_media_id' => 3, 'temporary_media' => false, 'team_id' => 1 },
533+
# model: Bot::Alegre::MEAN_TOKENS_MODEL,
534+
# source_field: 'original_description',
535+
# target_field: 'original_description',
536+
# relationship_type: { source: 'confirmed_sibling', target: 'confirmed_sibling' }
537+
# }
538+
# }
507539
if pm_id_scores.is_a?(Hash)
508-
# make K negative so that we bias towards older IDs
509-
pm_id_scores.sort_by{|k,v| [Bot::Alegre::ELASTICSEARCH_MODEL != v[:model] ? 1 : 0, v[:score], -k]}.reverse
540+
# Make K negative so that we bias towards older IDs
541+
pm_id_scores.sort_by{ |k,v| [Bot::Alegre::TEXT_MODEL_RANKS.fetch(v[:model], 1), v[:score], -k] }.reverse
510542
elsif pm_id_scores.is_a?(Array)
511-
pm_id_scores.sort_by{|v| [Bot::Alegre::ELASTICSEARCH_MODEL != v[:model] ? 1 : 0, v[:score]]}.reverse
543+
pm_id_scores.sort_by{ |v| [Bot::Alegre::TEXT_MODEL_RANKS.fetch(v[:model], 1), v[:score]] }.reverse
512544
else
513545
Rails.logger.error("[Alegre Bot] Unknown variable type in return_prioritized_matches: ##{pm_id_scores.class}")
514546
pm_id_scores

app/models/concerns/smooch_search.rb

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,20 @@ def reject_temporary_results(results)
103103
end
104104

105105
def parse_search_results_from_alegre(results, limit, published_only, after = nil, feed_id = nil, team_ids = nil)
106-
pms = reject_temporary_results(results).sort_by{ |a| [a[1][:model] != Bot::Alegre::ELASTICSEARCH_MODEL ? 1 : 0, a[1][:score]] }.to_h.keys.reverse.collect{ |id| Relationship.confirmed_parent(ProjectMedia.find_by_id(id)) }
106+
# Example for "results":
107+
# results = {
108+
# 2 => {
109+
# score: 0.75,
110+
# context: { 'team_id' => 1, 'project_media_id' => 2, 'has_custom_id' => true, 'field' => 'claim_description_content|report_visual_card_title', 'temporary_media' => false, 'contexts_count' => 14 },
111+
# model: Bot::Alegre::FILIPINO_MODEL
112+
# },
113+
# 3 => {
114+
# score: 0.85,
115+
# context: { 'team_id' => 1, 'project_media_id' => 2, 'has_custom_id' => true, 'field' => 'claim_description_content|report_visual_card_title', 'temporary_media' => false, 'contexts_count' => 4 },
116+
# model: Bot::Alegre::MEAN_TOKENS_MODEL
117+
# }
118+
# }
119+
pms = Bot::Alegre.return_prioritized_matches(reject_temporary_results(results)).to_h.keys.collect { |id| Relationship.confirmed_parent(ProjectMedia.find_by_id(id)) }
107120
filter_search_results(pms, after, feed_id, team_ids, published_only).uniq(&:id).first(limit)
108121
end
109122

app/models/explainer.rb

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,33 @@ def self.update_paragraphs_in_alegre(id, previous_paragraphs_count, timestamp)
106106
end
107107
end
108108

109+
def self.sort_similarity_search_results(response)
110+
# Example for "response":
111+
# response = {
112+
# 'result' => [
113+
# {
114+
# 'content_hash' => 'abc123',
115+
# 'doc_id' => 'xyz321',
116+
# 'context' => { 'type' => 'explainer', 'team_id' => 1, 'language' => 'en', 'explainer_id' => 2, 'paragraph' => 1 },
117+
# 'models' => [Bot::Alegre::FILIPINO_MODEL],
118+
# 'suppress_search_response' => true,
119+
# 'content' => 'Foo',
120+
# 'created_at' => '2025-04-05T01:59:08.010665',
121+
# 'language' => nil,
122+
# 'suppress_response' => false,
123+
# 'contexts' => [{ 'type' => 'explainer', 'team_id' => 1, 'language' => 'en', 'explainer_id' => 3, 'paragraph' => 1 }],
124+
# 'model' => Bot::Alegre::FILIPINO_MODEL,
125+
# '_id' => 'qwe789',
126+
# 'id' => 'qwe789',
127+
# 'index' => 'alegre_similarity',
128+
# '_score' => 0.75,
129+
# 'score' => 0.75
130+
# }
131+
# ]
132+
# }
133+
Bot::Alegre.return_prioritized_matches(response['result'].to_a.map(&:with_indifferent_access))
134+
end
135+
109136
def self.search_by_similarity(text, language, team_id, limit, custom_threshold = nil)
110137
models_thresholds = Explainer.get_alegre_models_and_thresholds(team_id)
111138
models_thresholds.each { |model, _threshold| models_thresholds[model] = custom_threshold } unless custom_threshold.blank?
@@ -121,7 +148,7 @@ def self.search_by_similarity(text, language, team_id, limit, custom_threshold =
121148
context: context
122149
}
123150
response = Bot::Alegre.query_sync_with_params(params, 'text')
124-
results = response['result'].to_a.sort_by{ |result| [result['model'] != Bot::Alegre::ELASTICSEARCH_MODEL ? 1 : 0, result['_score']] }.reverse
151+
results = Explainer.sort_similarity_search_results(response)
125152
explainer_ids = results.collect{ |result| result.dig('context', 'explainer_id').to_i }.uniq.first(limit)
126153
explainer_ids.empty? ? Explainer.none : Explainer.where(team_id: team_id, id: explainer_ids)
127154
end

test/models/bot/alegre_5_test.rb

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
require_relative '../../test_helper'
2+
3+
class Bot::Alegre5Test < ActiveSupport::TestCase
4+
def setup
5+
@team = create_team
6+
@pm1 = create_project_media team: @team
7+
@pm2 = create_project_media team: @team
8+
@ex1 = create_explainer team: @team
9+
@ex2 = create_explainer team: @team
10+
end
11+
12+
def teardown
13+
end
14+
15+
test "should rank results based on vector models rank when prioritizing matches" do
16+
pm_id_scores_array = [
17+
{ score: 0.75, context: { 'team_id' => @team.id, 'project_media_id' => @pm1.id, 'has_custom_id' => true, 'field' => 'original_title', 'temporary_media' => false }, model: Bot::Alegre::FILIPINO_MODEL },
18+
{ score: 0.85, context: { 'team_id' => @team.id, 'project_media_id' => @pm2.id, 'has_custom_id' => true, 'field' => 'original_title', 'temporary_media' => false }, model: Bot::Alegre::MEAN_TOKENS_MODEL }
19+
]
20+
pm_id_scores_hash = {
21+
@pm1.id => {
22+
score: 0.75,
23+
context: { 'has_custom_id' => true, 'field' => 'original_description', 'project_media_id' => @pm1.id, 'temporary_media' => false, 'team_id' => @team.id },
24+
model: Bot::Alegre::FILIPINO_MODEL,
25+
source_field: 'original_description',
26+
target_field: 'original_description',
27+
relationship_type: { source: 'confirmed_sibling', target: 'confirmed_sibling' }
28+
},
29+
@pm2.id => {
30+
score: 0.85,
31+
context: { 'has_custom_id' => true, 'field' => 'original_description', 'project_media_id' => @pm2.id, 'temporary_media' => false, 'team_id' => @team.id },
32+
model: Bot::Alegre::MEAN_TOKENS_MODEL,
33+
source_field: 'original_description',
34+
target_field: 'original_description',
35+
relationship_type: { source: 'confirmed_sibling', target: 'confirmed_sibling' }
36+
}
37+
}
38+
39+
assert_equal @pm1.id, Bot::Alegre.return_prioritized_matches(pm_id_scores_hash).first.first
40+
assert_equal @pm1.id, Bot::Alegre.return_prioritized_matches(pm_id_scores_array).first.dig(:context, 'project_media_id')
41+
assert_equal @pm1.id, Bot::Alegre.return_prioritized_matches(pm_id_scores_array.reverse).first.dig(:context, 'project_media_id')
42+
43+
pm_id_scores_hash[@pm2.id][:model] = Bot::Alegre::OPENAI_ADA_MODEL
44+
pm_id_scores_array[1][:model] = Bot::Alegre::OPENAI_ADA_MODEL
45+
46+
assert_equal @pm2.id, Bot::Alegre.return_prioritized_matches(pm_id_scores_hash).first.first
47+
assert_equal @pm2.id, Bot::Alegre.return_prioritized_matches(pm_id_scores_array).first.dig(:context, 'project_media_id')
48+
assert_equal @pm2.id, Bot::Alegre.return_prioritized_matches(pm_id_scores_array.reverse).first.dig(:context, 'project_media_id')
49+
end
50+
51+
test "should rank results based on vector models rank when parsing fact-check search results" do
52+
results = {
53+
@pm1.id => {
54+
score: 0.75,
55+
context: { 'team_id' => @team.id, 'project_media_id' => @pm1.id, 'has_custom_id' => true, 'field' => 'claim_description_content|report_visual_card_title', 'temporary_media' => false, 'contexts_count' => 14 },
56+
model: Bot::Alegre::FILIPINO_MODEL
57+
},
58+
@pm2.id => {
59+
score: 0.85,
60+
context: { 'team_id' => @team.id, 'project_media_id' => @pm2.id, 'has_custom_id' => true, 'field' => 'claim_description_content|report_visual_card_title', 'temporary_media' => false, 'contexts_count' => 4 },
61+
model: Bot::Alegre::MEAN_TOKENS_MODEL
62+
}
63+
}
64+
65+
assert_equal @pm1.id, Bot::Smooch.parse_search_results_from_alegre(results, 10, false).first.id
66+
67+
results[@pm2.id][:model] = Bot::Alegre::OPENAI_ADA_MODEL
68+
69+
assert_equal @pm2.id, Bot::Smooch.parse_search_results_from_alegre(results, 10, false).first.id
70+
end
71+
72+
test "should rank results based on vector models rank when parsing explainer search results" do
73+
response = {
74+
'result' => [
75+
{
76+
'content_hash' => 'abc123',
77+
'doc_id' => 'xyz321',
78+
'context' => { 'type' => 'explainer', 'team_id' => @team.id, 'language' => 'en', 'explainer_id' => @ex1.id, 'paragraph' => 1 },
79+
'models' => [Bot::Alegre::FILIPINO_MODEL],
80+
'suppress_search_response' => true,
81+
'content' => 'Foo',
82+
'created_at' => '2025-04-05T01:59:08.010665',
83+
'language' => nil,
84+
'suppress_response' => false,
85+
'contexts' => [{ 'type' => 'explainer', 'team_id' => @team.id, 'language' => 'en', 'explainer_id' => @ex1.id, 'paragraph' => 1 }],
86+
'model' => Bot::Alegre::FILIPINO_MODEL,
87+
'_id' => 'qwe789',
88+
'id' => 'qwe789',
89+
'index' => 'alegre_similarity',
90+
'_score' => 0.75,
91+
'score' => 0.75
92+
},
93+
{
94+
'content_hash' => 'abc456',
95+
'doc_id' => 'xyz654',
96+
'context' => { 'type' => 'explainer', 'team_id' => @team.id, 'language' => 'en', 'explainer_id' => @ex2.id, 'paragraph' => 1 },
97+
'models' => [Bot::Alegre::MEAN_TOKENS_MODEL],
98+
'suppress_search_response' => true,
99+
'content' => 'Foo',
100+
'created_at' => '2025-04-04T01:59:08.010665',
101+
'language' => nil,
102+
'suppress_response' => false,
103+
'contexts' => [{ 'type' => 'explainer', 'team_id' => @team.id, 'language' => 'en', 'explainer_id' => @ex2.id, 'paragraph' => 1 }],
104+
'model' => Bot::Alegre::MEAN_TOKENS_MODEL,
105+
'_id' => 'qwe987',
106+
'id' => 'qwe987',
107+
'index' => 'alegre_similarity',
108+
'_score' => 0.85,
109+
'score' => 0.85
110+
}
111+
]
112+
}
113+
114+
assert_equal @ex1.id, Explainer.sort_similarity_search_results(response).first.dig('context', 'explainer_id')
115+
116+
response['result'][1]['model'] = Bot::Alegre::OPENAI_ADA_MODEL
117+
response['result'][1]['models'] = [Bot::Alegre::OPENAI_ADA_MODEL]
118+
119+
assert_equal @ex2.id, Explainer.sort_similarity_search_results(response).first.dig('context', 'explainer_id')
120+
end
121+
end

0 commit comments

Comments (0)