Add a `source` field to track the origin of training data

ramsayleung · ramsayleung · commit d1c22c02a0d3 · 2025-09-25T22:24:36.000-07:00
diff --git a/app/controllers/trained_messages_controller.rb b/app/controllers/trained_messages_controller.rb
@@ -17,6 +17,10 @@ def index
       @trained_messages = @trained_messages.where(group_name: params[:group_name])
     end
 
+    if params[:source].present? && params[:source] != "all"
+      @trained_messages = @trained_messages.where(source: params[:source])
+    end
+
     if params[:search].present?
       @trained_messages = @trained_messages.where("message LIKE ?", "%#{params[:search]}%")
     end
@@ -41,12 +45,13 @@ def index
     @total_pages = (@total_count.to_f / @per_page).ceil
 
     # Using unscoped ensures we get all possible options, not just the filtered ones.
-    filter_data = TrainedMessage.unscoped.distinct.pluck(:message_type, :training_target, :group_name)
+    filter_data = TrainedMessage.unscoped.distinct.pluck(:message_type, :training_target, :group_name, :source)
 
     # Get filter options
     @message_types = filter_data.map(&:first).uniq.compact.sort
     @training_targets = filter_data.map(&:second).uniq.compact.sort
     @group_names = filter_data.map(&:third).uniq.compact.sort
+    @sources = filter_data.map(&:fourth).uniq.compact.sort
   end
 
   # GET /trained_messages/1 or /trained_messages/1.json
diff --git a/app/models/trained_message.rb b/app/models/trained_message.rb
@@ -2,6 +2,7 @@ class TrainedMessage < ApplicationRecord
   enum :message_type, { spam: 0, ham: 1, untrained: 2, maybe_spam: 3, maybe_ham: 4 }
   # New enum for what is being trained
   enum :training_target, { message_content: 0, user_name: 1 }
+  enum :source, { chat: 0, feedspam_command: 1, import: 2 }
   module MessageType
     SPAM = "spam"
     HAM = "ham"
@@ -14,6 +15,11 @@ module TrainingTarget
     MESSAGE_CONTENT = "message_content"
     USER_NAME = "user_name"
   end
+  module Source # String names of the `source` enum keys (chat, feedspam_command, import)
+    CHAT = "chat" # enum value 0, which is also the column default in the migration
+    FEEDSPAM_COMMAND = "feedspam_command" # set when a message is trained via execute_spam_training
+    IMPORT = "import" # set by the CSV import and Telegram data collector rake tasks
+  end
   GLOBAL_SHARED_MESSAGE = 0
 
   scope :shared, -> { where(group_id: GLOBAL_SHARED_MESSAGE) }
@@ -45,7 +51,7 @@ def should_ban_user
     end
 
     spam_ban_threshold = Rails.application.config.spam_ban_threshold
-    spam_count = TrainedMessage.where(group_id: self.group_id, sender_chat_id: self.sender_chat_id, message_type: :spam).count
+    spam_count = TrainedMessage.where(group_id: self.group_id, sender_chat_id: self.sender_chat_id, message_type: :spam, source: :chat).count
     chat_member = TelegramMemberFetcher.get_bot_chat_member(self.group_id)
     can_ban_user = [ "administrator", "creator" ].include?(chat_member&.status) && chat_member&.can_restrict_members
     if spam_count >= spam_ban_threshold && can_ban_user
diff --git a/app/services/telegram_botter.rb b/app/services/telegram_botter.rb
@@ -283,7 +283,8 @@ def execute_spam_training(bot, message, spam_text)
           message: spam_text,
           sender_chat_id: message.from.id,
           sender_user_name: user_name,
-          message_type: :maybe_spam
+          message_type: :maybe_spam,
+          source: :feedspam_command
         )
 
         # Show a preview of what was learned (truncated if too long)
diff --git a/app/views/trained_messages/_form.html.erb b/app/views/trained_messages/_form.html.erb
@@ -37,6 +37,14 @@
        { class: ["block shadow-sm rounded-md border px-3 py-2 mt-2 w-full", {"border-gray-400 focus:outline-blue-600": trained_message.errors[:training_target].none?, "border-red-400 focus:outline-red-600": trained_message.errors[:training_target].any?}] } %>
   </div>
 
+  <div class="my-5">
+    <%# Source select: offers every key of the TrainedMessage `source` enum and preselects the record's current value. %>
+    <%= form.label :source %>
+    <%= form.select :source,
+          options_for_select([['Chat', 'chat'], ['Feedspam Command', 'feedspam_command'], ['Import', 'import']], trained_message.source),
+          { prompt: 'Select source' }, { class: ["block shadow-sm rounded-md border px-3 py-2 mt-2 w-full", {"border-gray-400 focus:outline-blue-600": trained_message.errors[:source].none?, "border-red-400 focus:outline-red-600": trained_message.errors[:source].any?}] } %>
+  </div>
+
   <div class="my-5">
     <%= form.label :sender_chat_id %>
     <%= form.number_field :sender_chat_id, class: ["block shadow-sm rounded-md border px-3 py-2 mt-2 w-full", {"border-gray-400 focus:outline-blue-600": trained_message.errors[:sender_chat_id].none?, "border-red-400 focus:outline-red-600": trained_message.errors[:sender_chat_id].any?}] %>
diff --git a/app/views/trained_messages/_trained_message.html.erb b/app/views/trained_messages/_trained_message.html.erb
@@ -19,6 +19,10 @@
     <strong class="block font-medium mb-1">Training target:</strong>
     <%= trained_message.training_target %>
   </div>
+  <div>
+    <strong class="block font-medium mb-1">Source:</strong>
+    <%= trained_message.source %>
+  </div>
   <div>
     <strong class="block font-medium mb-1">Sender chat id:</strong>
     <%= trained_message.sender_chat_id %>
diff --git a/app/views/trained_messages/index.html.erb b/app/views/trained_messages/index.html.erb
@@ -45,6 +45,15 @@
               { class: "w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500" } %>
         </div>
 
+        <!-- Source Filter -->
+        <div>
+          <%= form.label :source, "Source", class: "block text-sm font-medium text-gray-700 mb-1" %>
+          <%= form.select :source, 
+              options_for_select([['All Sources', 'all']] + @sources.map { |source| [source&.humanize || 'Unknown', source] }, params[:source]), 
+              {}, 
+              { class: "w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500" } %>
+        </div>
+
         <!-- Group Filter -->
         <div>
           <%= form.label :group_name, "Group", class: "block text-sm font-medium text-gray-700 mb-1" %>
@@ -130,6 +139,7 @@
                   <% end %>
                 <% end %>
               </th>
+
               <th scope="col" class="px-6 py-3 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">
                 <%= link_to trained_messages_path(params.permit!.merge(sort: 'group_name', direction: params[:sort] == 'group_name' && params[:direction] == 'asc' ? 'desc' : 'asc')), class: "hover:text-gray-700 flex items-center space-x-1" do %>
                   <span>Group</span>
diff --git a/db/migrate/20250926045204_add_source_to_trained_messages.rb b/db/migrate/20250926045204_add_source_to_trained_messages.rb
@@ -0,0 +1,5 @@
+class AddSourceToTrainedMessages < ActiveRecord::Migration[8.0] # Adds the integer column backing TrainedMessage's `source` enum
+  def change
+    add_column :trained_messages, :source, :integer, default: 0, null: false # default 0 corresponds to `chat` in the model's source enum
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
diff --git a/lib/tasks/import.rake b/lib/tasks/import.rake
@@ -38,6 +38,7 @@ namespace :import do
           record.sender_chat_id = 0
           record.sender_user_name = "CSV Import"
           record.training_target = row["target"] || "message_content"
+          record.source = :import
 
           record.save!
 
diff --git a/lib/tasks/telegram_data_collector.rake b/lib/tasks/telegram_data_collector.rake
@@ -188,7 +188,8 @@ namespace :telegram do
         message_type: is_spam ? :maybe_spam : :maybe_ham,
         sender_user_name: user_name || "Telegram collector",
         training_target: training_target,
-        sender_chat_id: user_id
+        sender_chat_id: user_id,
+        source: :import
       )
     end
   end

Original file line number	Diff line number	Diff line change
`@@ -283,7 +283,8 @@ def execute_spam_training(bot, message, spam_text)`
`283`	`283`	`message: spam_text,`
`284`	`284`	`sender_chat_id: message.from.id,`
`285`	`285`	`sender_user_name: user_name,`
`286`		`- message_type: :maybe_spam`
	`286`	`+ message_type: :maybe_spam,`
	`287`	`+ source: :feedspam_command`
`287`	`288`	`)`
`288`	`289`
`289`	`290`	`# Show a preview of what was learned (truncated if too long)`