From 031b9b020a6ff551aa4a8698f509de9769b94962 Mon Sep 17 00:00:00 2001 From: Sawy Date: Fri, 23 Jan 2026 07:21:28 +0200 Subject: [PATCH 1/3] CV2-6562: add quote_hash to optimize query performance --- app/models/bot/smooch.rb | 3 ++- app/models/claim.rb | 5 ++-- ...123042611_add_quote_hash_to_media_table.rb | 6 +++++ db/schema.rb | 4 ++- ...250817055243_remove_blank_media_items.rake | 26 ++++++++++++++++--- 5 files changed, 36 insertions(+), 8 deletions(-) create mode 100644 db/migrate/20260123042611_add_quote_hash_to_media_table.rb diff --git a/app/models/bot/smooch.rb b/app/models/bot/smooch.rb index 0ce5f4ae83..9740d96db4 100644 --- a/app/models/bot/smooch.rb +++ b/app/models/bot/smooch.rb @@ -838,7 +838,8 @@ def self.save_text_message(message) # strip and remove null bytes claim = self.extract_claim(text).gsub(/\s+/, ' ').strip.gsub("\u0000", "\\u0000") extra = { quote: claim } - pm = ProjectMedia.joins(:media).where('trim(lower(quote)) = ?', claim.downcase).where('project_medias.team_id' => team.id).last + hash_value = Digest::MD5.hexdigest(claim.to_s.strip.downcase) + pm = ProjectMedia.joins(:media).where(quote_hash: hash_value).where('project_medias.team_id' => team.id).last # Don't create a new text media if it's an unconfirmed request with just a few words if pm.nil? && message['archived'] == CheckArchivedFlags::FlagCodes::UNCONFIRMED && ::Bot::Alegre.get_number_of_words(claim) < self.min_number_of_words_for_tipline_long_text return team diff --git a/app/models/claim.rb b/app/models/claim.rb index cca39a1407..ec9a06b9b8 100644 --- a/app/models/claim.rb +++ b/app/models/claim.rb @@ -19,8 +19,9 @@ def remove_null_bytes end def set_uuid - uuid = Claim.where('lower(quote) = ?', self.quote.to_s.strip.downcase).joins("INNER JOIN project_medias pm ON pm.media_id = medias.id").first&.id + hash_value = Digest::MD5.hexdigest(self.quote.to_s.strip.downcase) + uuid = Claim.where(quote_hash: hash_value).joins("INNER JOIN project_medias pm ON pm.media_id = medias.id").first&.id uuid ||= self.id - self.update_column(:uuid, uuid) + self.update_columns(uuid: uuid, quote_hash: hash_value) end end diff --git a/db/migrate/20260123042611_add_quote_hash_to_media_table.rb b/db/migrate/20260123042611_add_quote_hash_to_media_table.rb new file mode 100644 index 0000000000..b25480f619 --- /dev/null +++ b/db/migrate/20260123042611_add_quote_hash_to_media_table.rb @@ -0,0 +1,6 @@ +class AddQuoteHashToMediaTable < ActiveRecord::Migration[6.1] + def change + add_column :medias, :quote_hash, :string + add_index :medias, :quote_hash + end +end diff --git a/db/schema.rb b/db/schema.rb index 1635224cf9..2d9ac469f3 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema.define(version: 2025_11_26_195433) do +ActiveRecord::Schema.define(version: 2026_01_23_042611) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -474,8 +474,10 @@ t.integer "uuid", default: 0, null: false t.text "original_claim" t.string "original_claim_hash" + t.string "quote_hash" t.index "lower((quote)::text)", name: "index_medias_on_lower_quote", where: "((type)::text = 'Claim'::text)", using: :hash t.index ["original_claim_hash"], name: "index_medias_on_original_claim_hash", unique: true + t.index ["quote_hash"], name: "index_medias_on_quote_hash" t.index ["url"], name: "index_medias_on_url", unique: true end diff --git a/lib/tasks/migrate/20250817055243_remove_blank_media_items.rake b/lib/tasks/migrate/20250817055243_remove_blank_media_items.rake index 0bc07a011b..9d59932967 100644 --- a/lib/tasks/migrate/20250817055243_remove_blank_media_items.rake +++ b/lib/tasks/migrate/20250817055243_remove_blank_media_items.rake @@ -2,7 +2,8 @@ namespace :check do namespace :migrate do def get_claim_uuid(id, quote) - uuid = Claim.where('lower(quote) = ?', quote.to_s.strip.downcase).joins("INNER JOIN project_medias pm ON pm.media_id = medias.id").first&.id + hash_value = Digest::MD5.hexdigest(quote.to_s.strip.downcase) + uuid = Claim.where(quote_hash: hash_value).joins("INNER JOIN project_medias pm ON pm.media_id = medias.id").first&.id uuid ||= id end # bundle exec rails check:migrate:migrate_published_and_unpublished_items @@ -92,19 +93,36 @@ namespace :check do minutes = ((Time.now.to_i - started) / 60).to_i puts "[#{Time.now}] Done in #{minutes} minutes." end + # rake task to set quote_hash for Claims + # bundle exec rails check:migrate:set_claim_quote_hash + task set_claim_quote_hash: :environment do + started = Time.now.to_i + last_claim_id = Rails.cache.read('check:migrate:set_claim_quote_hash') || 0 + Claim.where('id > ?', last_claim_id) + .find_in_batches(batch_size: 2000) do |claims| + c_items = [] + claims.each do |claim| + print '.' + claim.quote_hash = Digest::MD5.hexdigest(claim.quote.to_s.strip.downcase) + c_items << claim.attributes + end + Claim.upsert_all(c_items) + Rails.cache.write('check:migrate:set_claim_quote_hash', claims.pluck(:id).max) + end + minutes = ((Time.now.to_i - started) / 60).to_i + puts "[#{Time.now}] Done in #{minutes} minutes." + end # rake task to set Claim uuid # bundle exec rails check:migrate:set_claim_uuid task set_claim_uuid: :environment do started = Time.now.to_i - last_claim_id = Rails.cache.read('check:migrate:set_claim_uuid') || 0 - Claim.where('id > ?', last_claim_id).where(uuid: 0) + Claim.where(uuid: 0) .find_in_batches(batch_size: 1000) do |claims| claims.each do |claim| print '.' uuid = get_claim_uuid(claim.id, claim.quote) claim.update_column(:uuid, uuid) end - Rails.cache.write('check:migrate:set_claim_uuid', claims.pluck(:id).max) end minutes = ((Time.now.to_i - started) / 60).to_i puts "[#{Time.now}] Done in #{minutes} minutes." From e3fa308caff367b7c5bdc97b273da144c55debc1 Mon Sep 17 00:00:00 2001 From: Sawy Date: Fri, 23 Jan 2026 10:31:04 +0200 Subject: [PATCH 2/3] CV2-6562: fix tests --- app/models/bot/smooch.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/models/bot/smooch.rb b/app/models/bot/smooch.rb index 9740d96db4..e85d4cf3cc 100644 --- a/app/models/bot/smooch.rb +++ b/app/models/bot/smooch.rb @@ -839,7 +839,7 @@ def self.save_text_message(message) claim = self.extract_claim(text).gsub(/\s+/, ' ').strip.gsub("\u0000", "\\u0000") extra = { quote: claim } hash_value = Digest::MD5.hexdigest(claim.to_s.strip.downcase) - pm = ProjectMedia.joins(:media).where(quote_hash: hash_value).where('project_medias.team_id' => team.id).last + pm = ProjectMedia.joins(:media).where('medias.quote_hash' => hash_value).where('project_medias.team_id' => team.id).last # Don't create a new text media if it's an unconfirmed request with just a few words if pm.nil? && message['archived'] == CheckArchivedFlags::FlagCodes::UNCONFIRMED && ::Bot::Alegre.get_number_of_words(claim) < self.min_number_of_words_for_tipline_long_text return team From 46d99030d26ed365a5d9ce348babc8b8a742d848 Mon Sep 17 00:00:00 2001 From: Sawy Date: Sun, 25 Jan 2026 08:03:25 +0200 Subject: [PATCH 3/3] CV2-6562: apply PR comments --- app/models/bot/smooch.rb | 3 +-- app/models/claim.rb | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/app/models/bot/smooch.rb b/app/models/bot/smooch.rb index e85d4cf3cc..3364f0027f 100644 --- a/app/models/bot/smooch.rb +++ b/app/models/bot/smooch.rb @@ -838,8 +838,7 @@ def self.save_text_message(message) # strip and remove null bytes claim = self.extract_claim(text).gsub(/\s+/, ' ').strip.gsub("\u0000", "\\u0000") extra = { quote: claim } - hash_value = Digest::MD5.hexdigest(claim.to_s.strip.downcase) - pm = ProjectMedia.joins(:media).where('medias.quote_hash' => hash_value).where('project_medias.team_id' => team.id).last + pm = ProjectMedia.joins(:media).where('medias.quote_hash' => Claim.generate_hash(claim)).where('project_medias.team_id' => team.id).last # Don't create a new text media if it's an unconfirmed request with just a few words if pm.nil? && message['archived'] == CheckArchivedFlags::FlagCodes::UNCONFIRMED && ::Bot::Alegre.get_number_of_words(claim) < self.min_number_of_words_for_tipline_long_text return team diff --git a/app/models/claim.rb b/app/models/claim.rb index ec9a06b9b8..ffa68a7d64 100644 --- a/app/models/claim.rb +++ b/app/models/claim.rb @@ -12,6 +12,10 @@ def media_type 'quote' end + def self.generate_hash(claim) + Digest::MD5.hexdigest(claim.to_s.strip.downcase) + end + private def remove_null_bytes @@ -19,7 +23,7 @@ def remove_null_bytes end def set_uuid - hash_value = Digest::MD5.hexdigest(self.quote.to_s.strip.downcase) + hash_value = Claim.generate_hash(self.quote) uuid = Claim.where(quote_hash: hash_value).joins("INNER JOIN project_medias pm ON pm.media_id = medias.id").first&.id uuid ||= self.id self.update_columns(uuid: uuid, quote_hash: hash_value)