Migrate sentiment to a TEI backend (#886)

xfalcox · web-flow · commit 772ee934ab77 · 2024-11-04T09:14:34.000-03:00
diff --git a/config/settings.yml b/config/settings.yml
@@ -56,22 +56,9 @@ discourse_ai:
   ai_sentiment_enabled:
     default: false
     client: true
-  ai_sentiment_inference_service_api_endpoint:
-    default: "https://sentiment-testing.demo-by-discourse.com"
-  ai_sentiment_inference_service_api_endpoint_srv:
+  ai_sentiment_model_configs:
     default: ""
-    hidden: true
-  ai_sentiment_inference_service_api_key:
-    default: ""
-    secret: true
-  ai_sentiment_models:
-    type: list
-    list_type: compact
-    default: "emotion|sentiment"
-    allow_any: false
-    choices:
-      - sentiment
-      - emotion
+    json_schema: DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema
 
   ai_nsfw_detection_enabled:
     default: false
diff --git a/db/post_migrate/20241031041242_migrate_sentiment_classification_result_format.rb b/db/post_migrate/20241031041242_migrate_sentiment_classification_result_format.rb
@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+class MigrateSentimentClassificationResultFormat < ActiveRecord::Migration[7.1]
+  def up
+    DB.exec(<<~SQL)
+      UPDATE classification_results
+      SET
+        model_used = 'cardiffnlp/twitter-roberta-base-sentiment-latest',
+        classification = jsonb_build_object(
+          'neutral', (classification->>'neutral')::float / 100,
+          'negative', (classification->>'negative')::float / 100, 
+          'positive', (classification->>'positive')::float / 100
+        )
+      WHERE model_used = 'sentiment';
+
+      UPDATE classification_results
+      SET
+        model_used = 'j-hartmann/emotion-english-distilroberta-base',
+        classification = jsonb_build_object(
+          'sadness', (classification->>'sadness')::float / 100,
+          'surprise', (classification->>'surprise')::float / 100, 
+          'fear', (classification->>'fear')::float / 100,
+          'anger', (classification->>'anger')::float / 100,
+          'joy', (classification->>'joy')::float / 100,
+          'disgust', (classification->>'disgust')::float / 100,
+          'neutral', (classification->>'neutral')::float / 100
+        )
+      WHERE model_used = 'emotion';
+    SQL
+  end
+end
diff --git a/lib/inference/hugging_face_text_embeddings.rb b/lib/inference/hugging_face_text_embeddings.rb
@@ -58,6 +58,29 @@ def rerank(content, candidates)
           JSON.parse(response.body, symbolize_names: true)
         end
 
+        def classify(content, model_config)
+          headers = { "Referer" => Discourse.base_url, "Content-Type" => "application/json" }
+          headers["X-API-KEY"] = model_config.api_key
+          headers["Authorization"] = "Bearer #{model_config.api_key}"
+
+          body = { inputs: content, truncate: true }.to_json
+
+          api_endpoint = model_config.endpoint
+          if api_endpoint.present? && api_endpoint.start_with?("srv://")
+            service = DiscourseAi::Utils::DnsSrv.lookup(api_endpoint.delete_prefix("srv://"))
+            api_endpoint = "https://#{service.target}:#{service.port}"
+          end
+
+          conn = Faraday.new { |f| f.adapter FinalDestination::FaradayAdapter }
+          response = conn.post(api_endpoint, body, headers)
+
+          if response.status != 200
+            raise Net::HTTPBadResponse.new("Status: #{response.status}\n\n#{response.body}")
+          end
+
+          JSON.parse(response.body, symbolize_names: true)
+        end
+
         def reranker_configured?
           SiteSetting.ai_hugging_face_tei_reranker_endpoint.present? ||
             SiteSetting.ai_hugging_face_tei_reranker_endpoint_srv.present?
diff --git a/lib/sentiment/entry_point.rb b/lib/sentiment/entry_point.rb
@@ -16,11 +16,11 @@ def inject_into(plugin)
 
         plugin.add_report("overall_sentiment") do |report|
           report.modes = [:stacked_chart]
-          threshold = 60
+          threshold = 0.6
 
           sentiment_count_sql = Proc.new { |sentiment| <<~SQL }
             COUNT(
-              CASE WHEN (cr.classification::jsonb->'#{sentiment}')::integer > :threshold THEN 1 ELSE NULL END
+              CASE WHEN (cr.classification::jsonb->'#{sentiment}')::float > :threshold THEN 1 ELSE NULL END
             ) AS #{sentiment}_count
           SQL
 
@@ -39,7 +39,7 @@ def inject_into(plugin)
             WHERE
               t.archetype = 'regular' AND
               p.user_id > 0 AND
-              cr.model_used = 'sentiment' AND
+              cr.model_used = 'cardiffnlp/twitter-roberta-base-sentiment-latest' AND
               (p.created_at > :report_start AND p.created_at < :report_end)
             GROUP BY DATE_TRUNC('day', p.created_at)
           SQL
@@ -68,11 +68,11 @@ def inject_into(plugin)
 
         plugin.add_report("post_emotion") do |report|
           report.modes = [:stacked_line_chart]
-          threshold = 30
+          threshold = 0.3
 
           emotion_count_clause = Proc.new { |emotion| <<~SQL }
     COUNT(
-      CASE WHEN (cr.classification::jsonb->'#{emotion}')::integer > :threshold THEN 1 ELSE NULL END
+      CASE WHEN (cr.classification::jsonb->'#{emotion}')::float > :threshold THEN 1 ELSE NULL END
     ) AS #{emotion}_count
   SQL
 
@@ -96,7 +96,7 @@ def inject_into(plugin)
       WHERE
         t.archetype = 'regular' AND
         p.user_id > 0 AND
-        cr.model_used = 'emotion' AND
+        cr.model_used = 'j-hartmann/emotion-english-distilroberta-base' AND
         (p.created_at > :report_start AND p.created_at < :report_end)
       GROUP BY DATE_TRUNC('day', p.created_at)
       SQL
diff --git a/lib/sentiment/sentiment_classification.rb b/lib/sentiment/sentiment_classification.rb
@@ -7,17 +7,17 @@ def type
         :sentiment
       end
 
-      def available_models
-        SiteSetting.ai_sentiment_models.split("|")
+      def available_classifiers
+        DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values
       end
 
       def can_classify?(target)
         content_of(target).present?
       end
 
       def get_verdicts(_)
-        available_models.reduce({}) do |memo, model|
-          memo[model] = false
+        available_classifiers.reduce({}) do |memo, model|
+          memo[model.model_name] = false
           memo
         end
       end
@@ -30,21 +30,23 @@ def should_flag_based_on?(_verdicts)
       def request(target_to_classify)
         target_content = content_of(target_to_classify)
 
-        available_models.reduce({}) do |memo, model|
-          memo[model] = request_with(model, target_content)
+        available_classifiers.reduce({}) do |memo, model|
+          memo[model.model_name] = request_with(target_content, model)
           memo
         end
       end
 
+      def transform_result(result)
+        hash_result = {}
+        result.each { |r| hash_result[r[:label]] = r[:score] }
+        hash_result
+      end
+
       private
 
-      def request_with(model, content)
-        ::DiscourseAi::Inference::DiscourseClassifier.perform!(
-          "#{endpoint}/api/v1/classify",
-          model,
-          content,
-          SiteSetting.ai_sentiment_inference_service_api_key,
-        )
+      def request_with(content, model_config)
+        result = ::DiscourseAi::Inference::HuggingFaceTextEmbeddings.classify(content, model_config)
+        transform_result(result)
       end
 
       def content_of(target_to_classify)
@@ -57,18 +59,6 @@ def content_of(target_to_classify)
 
         Tokenizer::BertTokenizer.truncate(content, 512)
       end
-
-      def endpoint
-        if SiteSetting.ai_sentiment_inference_service_api_endpoint_srv.present?
-          service =
-            DiscourseAi::Utils::DnsSrv.lookup(
-              SiteSetting.ai_sentiment_inference_service_api_endpoint_srv,
-            )
-          "https://#{service.target}:#{service.port}"
-        else
-          SiteSetting.ai_sentiment_inference_service_api_endpoint
-        end
-      end
     end
   end
 end
diff --git a/lib/sentiment/sentiment_site_setting_json_schema.rb b/lib/sentiment/sentiment_site_setting_json_schema.rb
@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Sentiment
+    class SentimentSiteSettingJsonSchema
+      def self.schema
+        @schema ||= {
+          type: "array",
+          items: {
+            type: "object",
+            format: "table",
+            title: "model",
+            properties: {
+              model_name: {
+                type: "string",
+              },
+              endpoint: {
+                type: "string",
+              },
+              api_key: {
+                type: "string",
+              },
+            },
+            required: %w[model_name endpoint api_key],
+          },
+        }
+      end
+
+      def self.values
+        JSON.parse(SiteSetting.ai_sentiment_model_configs, object_class: OpenStruct)
+      end
+    end
+  end
+end
diff --git a/spec/db/migrate/20241031041242_migrate_sentiment_classification_result_format_spec.rb b/spec/db/migrate/20241031041242_migrate_sentiment_classification_result_format_spec.rb
@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
+require "rails_helper"
+require Rails.root.join(
+          "plugins/discourse-ai/db/post_migrate/20241031041242_migrate_sentiment_classification_result_format",
+        )
+
+RSpec.describe MigrateSentimentClassificationResultFormat do
+  let(:connection) { ActiveRecord::Base.connection }
+
+  before { connection.execute(<<~SQL) }
+      INSERT INTO classification_results (model_used, classification, created_at, updated_at) VALUES
+        ('sentiment', '{"neutral": 65, "negative": 20, "positive": 14}', NOW(), NOW()),
+        ('emotion', '{"sadness": 10, "surprise": 15, "fear": 5, "anger": 20, "joy": 30, "disgust": 8, "neutral": 10}', NOW(), NOW());
+    SQL
+
+  after { connection.execute("DELETE FROM classification_results") }
+
+  describe "#up" do
+    before { described_class.new.up }
+
+    it "migrates sentiment classifications correctly" do
+      sentiment_result = connection.execute(<<~SQL).first
+        SELECT * FROM classification_results 
+        WHERE model_used = 'cardiffnlp/twitter-roberta-base-sentiment-latest';
+      SQL
+
+      expected_sentiment = { "neutral" => 0.65, "negative" => 0.20, "positive" => 0.14 }
+
+      expect(JSON.parse(sentiment_result["classification"])).to eq(expected_sentiment)
+    end
+
+    it "migrates emotion classifications correctly" do
+      emotion_result = connection.execute(<<~SQL).first
+        SELECT * FROM classification_results 
+        WHERE model_used = 'j-hartmann/emotion-english-distilroberta-base';
+      SQL
+
+      expected_emotion = {
+        "sadness" => 0.10,
+        "surprise" => 0.15,
+        "fear" => 0.05,
+        "anger" => 0.20,
+        "joy" => 0.30,
+        "disgust" => 0.08,
+        "neutral" => 0.10,
+      }
+
+      expect(JSON.parse(emotion_result["classification"])).to eq(expected_emotion)
+    end
+  end
+end
diff --git a/spec/fabricators/classification_result_fabricator.rb b/spec/fabricators/classification_result_fabricator.rb
@@ -6,11 +6,13 @@
 end
 
 Fabricator(:sentiment_classification, from: :classification_result) do
-  model_used "sentiment"
-  classification { { negative: 72, neutral: 23, positive: 4 } }
+  model_used "cardiffnlp/twitter-roberta-base-sentiment-latest"
+  classification { { negative: 0.72, neutral: 0.23, positive: 0.4 } }
 end
 
 Fabricator(:emotion_classification, from: :classification_result) do
-  model_used "emotion"
-  classification { { negative: 72, neutral: 23, positive: 4 } }
+  model_used "j-hartmann/emotion-english-distilroberta-base"
+  classification do
+    { sadness: 0.72, surprise: 0.23, fear: 0.4, anger: 0.87, joy: 0.22, disgust: 0.70 }
+  end
 end
diff --git a/spec/lib/modules/sentiment/entry_point_spec.rb b/spec/lib/modules/sentiment/entry_point_spec.rb
@@ -53,16 +53,19 @@
   end
 
   describe "custom reports" do
-    before { SiteSetting.ai_sentiment_inference_service_api_endpoint = "http://test.com" }
+    before do
+      SiteSetting.ai_sentiment_model_configs =
+        "[{\"model_name\":\"SamLowe/roberta-base-go_emotions\",\"endpoint\":\"http://samlowe-emotion.com\",\"api_key\":\"123\"},{\"model_name\":\"j-hartmann/emotion-english-distilroberta-base\",\"endpoint\":\"http://jhartmann-emotion.com\",\"api_key\":\"123\"},{\"model_name\":\"cardiffnlp/twitter-roberta-base-sentiment-latest\",\"endpoint\":\"http://cardiffnlp-sentiment.com\",\"api_key\":\"123\"}]"
+    end
 
     fab!(:pm) { Fabricate(:private_message_post) }
 
     fab!(:post_1) { Fabricate(:post) }
     fab!(:post_2) { Fabricate(:post) }
 
     describe "overall_sentiment report" do
-      let(:positive_classification) { { negative: 2, neutral: 30, positive: 70 } }
-      let(:negative_classification) { { negative: 65, neutral: 2, positive: 10 } }
+      let(:positive_classification) { { negative: 0.2, neutral: 0.3, positive: 0.7 } }
+      let(:negative_classification) { { negative: 0.65, neutral: 0.2, positive: 0.1 } }
 
       def sentiment_classification(post, classification)
         Fabricate(:sentiment_classification, target: post, classification: classification)
@@ -84,12 +87,28 @@ def sentiment_classification(post, classification)
 
     describe "post_emotion report" do
       let(:emotion_1) do
-        { sadness: 49, surprise: 23, neutral: 6, fear: 34, anger: 87, joy: 22, disgust: 70 }
+        {
+          sadness: 0.49,
+          surprise: 0.23,
+          neutral: 0.6,
+          fear: 0.34,
+          anger: 0.87,
+          joy: 0.22,
+          disgust: 0.70,
+        }
       end
       let(:emotion_2) do
-        { sadness: 19, surprise: 63, neutral: 45, fear: 44, anger: 27, joy: 62, disgust: 30 }
+        {
+          sadness: 0.19,
+          surprise: 0.63,
+          neutral: 0.45,
+          fear: 0.44,
+          anger: 0.27,
+          joy: 0.62,
+          disgust: 0.30,
+        }
       end
-      let(:model_used) { "emotion" }
+      let(:model_used) { "j-hartmann/emotion-english-distilroberta-base" }
 
       def emotion_classification(post, classification)
         Fabricate(
@@ -106,7 +125,7 @@ def strip_emoji_and_downcase(str)
       end
 
       it "calculate averages using only public posts" do
-        threshold = 30
+        threshold = 0.30
 
         emotion_classification(post_1, emotion_1)
         emotion_classification(post_2, emotion_2)
diff --git a/spec/lib/modules/sentiment/jobs/regular/post_sentiment_analysis_spec.rb b/spec/lib/modules/sentiment/jobs/regular/post_sentiment_analysis_spec.rb
@@ -8,7 +8,8 @@
 
     before do
       SiteSetting.ai_sentiment_enabled = true
-      SiteSetting.ai_sentiment_inference_service_api_endpoint = "http://test.com"
+      SiteSetting.ai_sentiment_model_configs =
+        "[{\"model_name\":\"SamLowe/roberta-base-go_emotions\",\"endpoint\":\"http://samlowe-emotion.com\",\"api_key\":\"123\"},{\"model_name\":\"j-hartmann/emotion-english-distilroberta-base\",\"endpoint\":\"http://jhartmann-emotion.com\",\"api_key\":\"123\"},{\"model_name\":\"cardiffnlp/twitter-roberta-base-sentiment-latest\",\"endpoint\":\"http://cardiffnlp-sentiment.com\",\"api_key\":\"123\"}]"
     end
 
     describe "scenarios where we return early without doing anything" do
@@ -42,7 +43,8 @@
     end
 
     it "successfully classifies the post" do
-      expected_analysis = SiteSetting.ai_sentiment_models.split("|").length
+      expected_analysis =
+        DiscourseAi::Sentiment::SentimentClassification.new.available_classifiers.length
       SentimentInferenceStubs.stub_classification(post)
 
       subject.execute({ post_id: post.id })
diff --git a/spec/lib/modules/sentiment/sentiment_classification_spec.rb b/spec/lib/modules/sentiment/sentiment_classification_spec.rb
diff --git a/spec/shared/classificator_spec.rb b/spec/shared/classificator_spec.rb
diff --git a/spec/support/sentiment_inference_stubs.rb b/spec/support/sentiment_inference_stubs.rb
diff --git a/spec/tasks/backfill_spec.rb b/spec/tasks/backfill_spec.rb