Add train_batch support to train multiple messages

ramsayleung · ramsayleung · commit 1021a478230b · 2025-09-10T23:37:47.000-07:00
diff --git a/app/jobs/classifier_trainer_job.rb b/app/jobs/classifier_trainer_job.rb
@@ -2,17 +2,33 @@ class ClassifierTrainerJob < ApplicationJob
   # Job to train classifier asynchronously
   queue_as :training
 
+  # Trains classifiers with a batch of labelled messages: user-name messages
+  # go to every classifier in the `username` scope, message-content messages
+  # to every classifier in the `for_group` scope.
+  #
+  # @param trained_messages [Array<TrainedMessage>, TrainedMessage] a list of
+  #   records, or a single record as passed by the previous signature
-  def perform(trained_message)
+  def perform(trained_messages)
     Rails.logger.info "Retrain all the classifiers for public"
-    if trained_message.user_name?
+    # Kernel#Array wraps a lone record (and turns nil into []), so jobs that
+    # were enqueued with the old single-message argument still run.
+    messages = Array(trained_messages)
+
+    # Separate messages by their training target
+    user_name_messages       = messages.select(&:user_name?)
+    message_content_messages = messages.select(&:message_content?)
+
+    if user_name_messages.any?
       GroupClassifierState.username.find_each do |classifier|
         spam_classifier = SpamClassifierService.new(classifier.group_id, classifier.group_name)
-        spam_classifier.train(trained_message)
+        spam_classifier.train_batch(user_name_messages)
       end
-    elsif trained_message.message_content?
+    end
+
+    if message_content_messages.any?
       GroupClassifierState.for_group.find_each do |classifier|
         spam_classifier = SpamClassifierService.new(classifier.group_id, classifier.group_name)
-        spam_classifier.train(trained_message)
+        spam_classifier.train_batch(message_content_messages)
       end
     end
   end
diff --git a/app/models/trained_message.rb b/app/models/trained_message.rb
@@ -57,7 +57,7 @@ def should_ban_user
   def retrain_classifier
     return if untrained?
 
-    ClassifierTrainerJob.perform_later(self)
+    ClassifierTrainerJob.perform_later([ self ]) # the job takes a list of messages, so wrap the single record
   end
 
   private
diff --git a/app/services/spam_classifier_service.rb b/app/services/spam_classifier_service.rb
@@ -61,6 +61,13 @@ def train(trained_message)
     @classifier_state.save!
   end
 
+  # Runs train_only on each message, then saves the classifier state once.
+  def train_batch(trained_messages)
+    trained_messages.each { |message| train_only(message) }
+
+    @classifier_state.save!
+  end
+
   def classify(message_text)
     # P(Spam|Words) = P(Words|Spam) * P(Spam) / P(Words)
     # Return false if the model isn't trained enough
diff --git a/test/services/spam_classifier_service_test.rb b/test/services/spam_classifier_service_test.rb
@@ -188,6 +188,31 @@ def setup
     assert spam_score > ham_score, "Spam score should be higher than ham score"
   end
 
+  test "#train_batch train a list of messages and identify spam message correctly" do
+    service = SpamClassifierService.new(@group_id, @group_name)
+    # Every sample shares the group and sender name; only the text, label and chat id vary.
+    build_message = lambda do |text, label, chat_id|
+      TrainedMessage.new(
+        group_id: @group_id,
+        message: text,
+        message_type: label,
+        sender_chat_id: chat_id,
+        sender_user_name: "s"
+      )
+    end
+
+    service.train_batch([
+      build_message.call("便宜的伟哥现在买", :spam, 1),
+      build_message.call("免费点击这里", :spam, 1),
+      build_message.call("你好，今天天气不错", :ham, 2)
+    ])
+
+    is_spam, spam_score, ham_score = service.classify("点击这里买伟哥")
+
+    assert is_spam, "Message should be classified as spam"
+    assert spam_score > ham_score, "Spam score should be higher than ham score"
+  end
+
   test "#classify should correctly identify a message as ham" do
     service = SpamClassifierService.new(@group_id, @group_name)
     service.train(TrainedMessage.new(