Skip to content

Commit 6e7cdc5

Browse files
committed
Add a rule-based classifier to detect messages whose characters are separated by spaces
1 parent 0534724 commit 6e7cdc5

File tree

7 files changed

+88
-5
lines changed

7 files changed

+88
-5
lines changed
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Flags messages as spam using hand-written heuristics rather than a trained
# model. The only rule implemented here targets Chinese text whose characters
# are individually separated by whitespace, for example:
#
#   跟 单 像 捡 钱 ! 再 不 进 群 是 傻 狗 !
#   懷 疑 有 特 異 功 能 ! 總 能 提 前 知 道 !
class RuleBasedClassifier
  # Threshold value as it was when this class was loaded. Kept so existing
  # references to the constant keep working; classification itself reads the
  # live config value on every call.
  CHINESE_SPACING_THRESHOLD = Rails.application.config.chinese_space_spam_threshold

  # @param message_text [String, nil] raw message text; nil is treated as an
  #   empty message so that #classify never raises on a missing body.
  def initialize(message_text)
    @message_text = message_text.to_s
  end

  # Runs every rule and returns the verdict.
  #
  # @return [Shared::ClassificationResult] `is_spam: true` with
  #   `target: "message_content"` when a rule fires, otherwise
  #   `is_spam: false` with `target: nil`.
  def classify
    check_chinese_spacing_spam
  end

  private

  # Computes the share of Han characters that are immediately followed by
  # whitespace and reports spam when that share is strictly greater than
  # `Rails.application.config.chinese_space_spam_threshold`.
  #
  # NOTE(review): only "Han character followed by whitespace" is checked, not
  # "Han, space, Han", so a message containing a single Han character followed
  # by a space has a ratio of 1.0. Confirm whether a minimum character count
  # is wanted.
  # NOTE(review): `\s` in Ruby matches ASCII whitespace only, so full-width
  # spaces (U+3000) are not counted. Confirm this is intended.
  def check_chinese_spacing_spam
    han_total = @message_text.scan(/\p{Han}/).size
    return not_spam_result if han_total.zero?

    # The lookahead keeps the whitespace out of the match, so each hit is
    # exactly one Han character.
    han_before_space = @message_text.scan(/\p{Han}(?=\s)/).size
    ratio = han_before_space.to_f / han_total
    threshold = Rails.application.config.chinese_space_spam_threshold
    return not_spam_result unless ratio > threshold

    Rails.logger.info "Classified as spam due to high Chinese character spacing ratio: #{ratio}"
    Shared::ClassificationResult.new(is_spam: true, target: "message_content")
  end

  # Result returned when no rule matched.
  def not_spam_result
    Shared::ClassificationResult.new(is_spam: false, target: nil)
  end
end

app/services/spam_detection_service.rb

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
class SpamDetectionService
2-
ClassificationResult = Data.define(:is_spam, :target)
3-
42
def initialize(tg_message_struct)
53
@tg_message_struct = tg_message_struct
64
@group_id = tg_message_struct.chat.id
@@ -14,6 +12,12 @@ def initialize(tg_message_struct)
1412
def process
1513
return non_spam_result unless valid_message?
1614

15+
rule_result = RuleBasedClassifier.new(@message_text).classify
16+
if rule_result.is_spam
17+
create_trained_message(@message_text, rule_result.target)
18+
return rule_result
19+
end
20+
1721
targets_to_check = [
1822
{ name: "message_content", value: @message_text },
1923
{ name: "user_name", value: @username }
@@ -56,7 +60,7 @@ def handle_existing_message(existing_message)
5660
when "spam"
5761
@is_confident = true
5862
Rails.logger.info "Same message exists and already marked as spam: #{existing_message.message}, training target: #{existing_message.training_target}"
59-
ClassificationResult.new(is_spam: true, target: existing_message.training_target)
63+
Shared::ClassificationResult.new(is_spam: true, target: existing_message.training_target)
6064
when "ham"
6165
Rails.logger.info "Same message exists and already marked as ham: #{existing_message.message}, training target: #{existing_message.training_target}"
6266
non_spam_result
@@ -66,12 +70,13 @@ def handle_existing_message(existing_message)
6670
end
6771
end
6872

73+
6974
def classify_with_bayesian(target_name, target_value)
7075
classifier = build_classifier(target_name)
7176
is_spam, spam_score, ham_score = classifier.classify(target_value)
7277

7378
Rails.logger.info "Classified '#{target_value}' against '#{target_name}': is_spam=#{is_spam}, spam_score=#{spam_score}, ham_score=#{ham_score}"
74-
ClassificationResult.new(is_spam: is_spam, target: target_name)
79+
Shared::ClassificationResult.new(is_spam: is_spam, target: target_name)
7580
end
7681

7782
def build_classifier(target_name)
@@ -105,6 +110,6 @@ def valid_message?
105110
end
106111

107112
def non_spam_result
108-
ClassificationResult.new(is_spam: false, target: nil)
113+
Shared::ClassificationResult.new(is_spam: false, target: nil)
109114
end
110115
end

config/application.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,5 +32,6 @@ class Application < Rails::Application
3232
config.delete_message_delay = 5
3333
# Spam blocked probability threshold
3434
config.probability_threshold = 0.95
35+
config.chinese_space_spam_threshold = 0.8
3536
end
3637
end
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Namespace for value objects used by more than one classifier/service.
module Shared
  # Immutable classification verdict.
  #
  # Members:
  #   is_spam - whether the classified input was judged to be spam
  #   target  - name of the field the verdict applies to, or nil
  ClassificationResult = Data.define(:is_spam, :target)
end

lib/tasks/telegram_data_collector.rake

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,14 @@ namespace :telegram do
163163
classifier = SpamClassifierService.new(group_id, group_name)
164164
is_spam, _, _ = classifier.classify(text_to_classify)
165165

166+
if training_target == :message_content && !is_spam
167+
rule_based_classifier = RuleBasedClassifier.new(text_to_classify)
168+
is_spam = rule_based_classifier.classify().is_spam
169+
if is_spam
170+
puts "maybe_spam detected by rule_based classifier: #{text_to_classify}"
171+
end
172+
end
173+
166174
puts "#{training_target} classified result: #{is_spam ? 'maybe_spam' : 'maybe_ham'}"
167175

168176
spam_count = TrainedMessage.where(message_type: [ :spam, :maybe_spam ], training_target: training_target).count
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
require "test_helper"
2+
require "minitest/mock"
3+
4+
# Exercises RuleBasedClassifier through its public #classify entry point, one
# scenario per test so that a failure in one input does not hide the others.
class RuleBasedClassifierIntegrationTest < ActiveSupport::TestCase
  test "returns spam if message is spam with excessive spacing" do
    result = RuleBasedClassifier.new("跟 单 像 捡 钱 ! 再 不 进 群 是 傻 狗 !").classify

    assert result.is_spam
    assert_equal TrainedMessage::TrainingTarget::MESSAGE_CONTENT, result.target
  end

  test "return non-spam if message has normal spacing" do
    result = RuleBasedClassifier.new(" Q妹,我私信你 U理财 ").classify

    assert_not result.is_spam
    assert_nil result.target
  end

  test "return non-spam if Chinese text is embedded in Latin text" do
    result = RuleBasedClassifier.new("Combot警告了Donald Williams (1/1)").classify

    assert_not result.is_spam
    assert_nil result.target
  end

  test "return non-spam if message has no Chinese characters" do
    result = RuleBasedClassifier.new("hello world").classify

    assert_not result.is_spam
    assert_nil result.target
  end

  test "return non-spam if message is empty" do
    result = RuleBasedClassifier.new("").classify

    assert_not result.is_spam
    assert_nil result.target
  end
end

test/services/spam_detection_service_test.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
require "test_helper"
22
require "ostruct"
3+
require "minitest/mock"
34

45
class SpamDetectionServiceIntegrationTest < ActiveSupport::TestCase
56
fixtures :group_classifier_states, :trained_messages

0 commit comments

Comments
 (0)