Add a rule-based classifier to detect messages whose characters are separated by spaces

ramsayleung · ramsayleung · commit 6e7cdc50b616 · 2025-09-23T22:41:00.000-07:00
diff --git a/app/services/rule_based_classifier.rb b/app/services/rule_based_classifier.rb
@@ -0,0 +1,41 @@
+class RuleBasedClassifier
+  CHINESE_SPACING_THRESHOLD = Rails.application.config.chinese_space_spam_threshold
+
+  def initialize(message_text)
+    @message_text = message_text
+  end
+
+  def classify
+    result = check_chinese_spacing_spam
+    if result.is_spam
+      return result
+    end
+
+    result
+  end
+
+  private
+
+  def check_chinese_spacing_spam
+    # This pattern specifically looks for a Chinese character, followed by a space,
+    # and then another Chinese character like this
+    # 跟 单 像 捡 钱 ！ 再 不 进 群 是 傻 狗 ！
+    # 懷 疑 有 特 異 功 能 ！ 總 能 提 前 知 道 ！
+    if @message_text.match?(/\p{Han}/)
+      # Count Chinese characters that are immediately followed by a space.
+      # The lookahead `(?=\s)` ensures the space itself is not included in the match,
+      # so we can count only the characters.
+      spaced_chinese_words_count = @message_text.scan(/\p{Han}(?=\s)/).size
+      chinese_chars = @message_text.scan(/\p{Han}/).size
+
+      threshold = Rails.application.config.chinese_space_spam_threshold
+      ratio = chinese_chars > 0 ? spaced_chinese_words_count.to_f / chinese_chars : 0.0
+
+      if ratio > threshold
+        Rails.logger.info "Classified as spam due to high Chinese character spacing ratio: #{ratio}"
+        return Shared::ClassificationResult.new(is_spam: true, target: "message_content")
+      end
+    end
+    Shared::ClassificationResult.new(is_spam: false, target: nil)
+  end
+end
diff --git a/app/services/spam_detection_service.rb b/app/services/spam_detection_service.rb
@@ -1,6 +1,4 @@
 class SpamDetectionService
-  ClassificationResult = Data.define(:is_spam, :target)
-
   def initialize(tg_message_struct)
     @tg_message_struct = tg_message_struct
     @group_id = tg_message_struct.chat.id
@@ -14,6 +12,12 @@ def initialize(tg_message_struct)
   def process
     return non_spam_result unless valid_message?
 
+    rule_result = RuleBasedClassifier.new(@message_text).classify
+    if rule_result.is_spam
+      create_trained_message(@message_text, rule_result.target)
+      return rule_result
+    end
+
     targets_to_check = [
       { name: "message_content", value: @message_text },
       { name: "user_name",       value: @username }
@@ -56,7 +60,7 @@ def handle_existing_message(existing_message)
     when "spam"
       @is_confident = true
       Rails.logger.info "Same message exists and already marked as spam: #{existing_message.message}, training target: #{existing_message.training_target}"
-      ClassificationResult.new(is_spam: true, target: existing_message.training_target)
+      Shared::ClassificationResult.new(is_spam: true, target: existing_message.training_target)
     when "ham"
       Rails.logger.info "Same message exists and already marked as ham: #{existing_message.message}, training target: #{existing_message.training_target}"
       non_spam_result
@@ -66,12 +70,13 @@ def handle_existing_message(existing_message)
     end
   end
 
+
   def classify_with_bayesian(target_name, target_value)
     classifier = build_classifier(target_name)
     is_spam, spam_score, ham_score = classifier.classify(target_value)
 
     Rails.logger.info "Classified '#{target_value}' against '#{target_name}': is_spam=#{is_spam}, spam_score=#{spam_score}, ham_score=#{ham_score}"
-    ClassificationResult.new(is_spam: is_spam, target: target_name)
+    Shared::ClassificationResult.new(is_spam: is_spam, target: target_name)
   end
 
   def build_classifier(target_name)
@@ -105,6 +110,6 @@ def valid_message?
   end
 
   def non_spam_result
-    ClassificationResult.new(is_spam: false, target: nil)
+    Shared::ClassificationResult.new(is_spam: false, target: nil)
   end
 end
diff --git a/config/application.rb b/config/application.rb
@@ -32,5 +32,6 @@ class Application < Rails::Application
     config.delete_message_delay = 5
     # Spam blocked probability threshold
     config.probability_threshold = 0.95
+    config.chinese_space_spam_threshold = 0.8
   end
 end
diff --git a/lib/shared/classification_result.rb b/lib/shared/classification_result.rb
@@ -0,0 +1,3 @@
+module Shared # namespace for the result type shared by RuleBasedClassifier and SpamDetectionService
+  ClassificationResult = Data.define(:is_spam, :target) # immutable verdict: spam flag plus the target that was classified (nil for non-spam results)
+end
diff --git a/lib/tasks/telegram_data_collector.rake b/lib/tasks/telegram_data_collector.rake
@@ -163,6 +163,14 @@ namespace :telegram do
     classifier = SpamClassifierService.new(group_id, group_name)
     is_spam, _, _ = classifier.classify(text_to_classify)
 
+    if training_target == :message_content && !is_spam
+      rule_based_classifier = RuleBasedClassifier.new(text_to_classify)
+      is_spam = rule_based_classifier.classify().is_spam
+      if is_spam
+        puts "maybe_spam detected by rule_based classifier: #{text_to_classify}"
+      end
+    end
+
     puts "#{training_target} classified result: #{is_spam ? 'maybe_spam' : 'maybe_ham'}"
 
     spam_count = TrainedMessage.where(message_type: [ :spam, :maybe_spam ], training_target: training_target).count
diff --git a/test/services/rule_based_classifier_test.rb b/test/services/rule_based_classifier_test.rb
@@ -0,0 +1,24 @@
+require "test_helper"
+require "minitest/mock"
+
+class RuleBasedClassifierIntegrationTest < ActiveSupport::TestCase
+  test "returns spam if message is spam with excessive spacing" do
+    spam_text = "跟 单 像 捡 钱 ！ 再 不 进 群 是 傻 狗 ！"
+    service = RuleBasedClassifier.new(spam_text)
+    result = service.send(:check_chinese_spacing_spam)
+    assert result.is_spam
+    assert_equal TrainedMessage::TrainingTarget::MESSAGE_CONTENT, result.target
+  end
+
+  test "return non-spam if message has normal spacing" do
+    text = " Q妹，我私信你 U理财 "
+    service = RuleBasedClassifier.new(text)
+    result = service.send(:check_chinese_spacing_spam)
+    assert_not result.is_spam
+
+    text = "Combot警告了Donald Williams (1/1)"
+    service = RuleBasedClassifier.new(text)
+    result = service.send(:check_chinese_spacing_spam)
+    assert_not result.is_spam
+  end
+end
diff --git a/test/services/spam_detection_service_test.rb b/test/services/spam_detection_service_test.rb
@@ -1,5 +1,6 @@
 require "test_helper"
 require "ostruct"
+require "minitest/mock"
 
 class SpamDetectionServiceIntegrationTest < ActiveSupport::TestCase
   fixtures :group_classifier_states, :trained_messages

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+module Shared`
	`2`	`+ ClassificationResult = Data.define(:is_spam, :target)`
	`3`	`+end`