From 04e2df7796998d6379c499931f2ad4614e38b973 Mon Sep 17 00:00:00 2001 From: Rafael Silva Date: Wed, 4 Dec 2024 11:55:08 -0300 Subject: [PATCH 1/2] FIX: More resilient sentiment backfill query --- lib/sentiment/post_classification.rb | 74 +++++++++---------- .../sentiment/post_classification_spec.rb | 2 + 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/lib/sentiment/post_classification.rb b/lib/sentiment/post_classification.rb index cf558f6f4..c2dfae443 100644 --- a/lib/sentiment/post_classification.rb +++ b/lib/sentiment/post_classification.rb @@ -5,43 +5,43 @@ module Sentiment class PostClassification def self.backfill_query(from_post_id: nil, max_age_days: nil) available_classifier_names = - DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema - .values - .map { |mc| mc.model_name.downcase } - .sort - - base_query = - Post - .includes(:sentiment_classifications) - .joins("INNER JOIN topics ON topics.id = posts.topic_id") - .where(post_type: Post.types[:regular]) - .where.not(topics: { archetype: Archetype.private_message }) - .where(posts: { deleted_at: nil }) - .where(topics: { deleted_at: nil }) - .joins(<<~SQL) - LEFT JOIN classification_results crs - ON crs.target_id = posts.id - AND crs.target_type = 'Post' - AND crs.classification_type = 'sentiment' - SQL - .group("posts.id") - .having(<<~SQL, available_classifier_names) - COUNT(crs.model_used) = 0 - OR array_agg( - DISTINCT LOWER(crs.model_used) ORDER BY LOWER(crs.model_used) - )::text[] IS DISTINCT FROM array[?] - SQL - - base_query = base_query.where("posts.id >= ?", from_post_id.to_i) if from_post_id.present? - - if max_age_days.present? - base_query = - base_query.where( - "posts.created_at > current_date - INTERVAL '#{max_age_days.to_i} DAY'", - ) - end - - base_query + DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values.map { _1.model_name } + + queries = + available_classifier_names.map do |classifier_name| + base_query = + Post + .includes(:sentiment_classifications) + .joins("INNER JOIN topics ON topics.id = posts.topic_id") + .where(post_type: Post.types[:regular]) + .where.not(topics: { archetype: Archetype.private_message }) + .where(posts: { deleted_at: nil }) + .where(topics: { deleted_at: nil }) + .joins(<<~SQL) + LEFT JOIN classification_results crs + ON crs.target_id = posts.id + AND crs.target_type = 'Post' + AND crs.classification_type = 'sentiment' + AND crs.model_used = '#{classifier_name}' + SQL + .where("crs.id IS NULL") + + base_query = + base_query.where("posts.id >= ?", from_post_id.to_i) if from_post_id.present? + + if max_age_days.present? + base_query = + base_query.where( + "posts.created_at > current_date - INTERVAL '#{max_age_days.to_i} DAY'", + ) + end + + base_query + end + + unioned_queries = queries.map(&:to_sql).join(" UNION ") + + Post.from(Arel.sql("(#{unioned_queries}) as posts")) end def bulk_classify!(relation) diff --git a/spec/lib/modules/sentiment/post_classification_spec.rb b/spec/lib/modules/sentiment/post_classification_spec.rb index f63da694e..ec1862a13 100644 --- a/spec/lib/modules/sentiment/post_classification_spec.rb +++ b/spec/lib/modules/sentiment/post_classification_spec.rb @@ -132,6 +132,8 @@ def check_classification_for(post) posts = described_class.backfill_query + debugger + expect(posts).to be_empty end From 32e09defb238cfb2c2007b98e5b740cb58965472 Mon Sep 17 00:00:00 2001 From: Rafael Silva Date: Wed, 4 Dec 2024 11:58:20 -0300 Subject: [PATCH 2/2] oops debugger --- spec/lib/modules/sentiment/post_classification_spec.rb | 2 -- 1 file changed, 2 deletions(-) diff --git a/spec/lib/modules/sentiment/post_classification_spec.rb b/spec/lib/modules/sentiment/post_classification_spec.rb index ec1862a13..f63da694e 100644 --- a/spec/lib/modules/sentiment/post_classification_spec.rb +++ b/spec/lib/modules/sentiment/post_classification_spec.rb @@ -132,8 +132,6 @@ def check_classification_for(post) posts = described_class.backfill_query - debugger - expect(posts).to be_empty end