@@ -47,11 +47,13 @@ def train_only(trained_message)
         @classifier_state.spam_counts[token] = @classifier_state.spam_counts.fetch(token, 0) + 1
         @vocabulary.add(token)
       end
-    else # :ham
+    else # :ham - FALSE POSITIVE BIAS: count ham tokens double
+      # https://www.paulgraham.com/better.html
       @classifier_state.total_ham_messages += 1
-      @classifier_state.total_ham_words += tokens.size
+      @classifier_state.total_ham_words += tokens.size * 2 # Double-count ham
+      # tokens to bias the classifier against false positives
       tokens.each do |token|
-        @classifier_state.ham_counts[token] = @classifier_state.ham_counts.fetch(token, 0) + 1
+        @classifier_state.ham_counts[token] = @classifier_state.ham_counts.fetch(token, 0) + 2 # Double weight
         @vocabulary.add(token)
       end
     end
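For intuition, here is a standalone sketch (the counts and vocabulary size are made up, not taken from this repo) of how double-counting ham tokens inflates the smoothed P(token|ham) and nudges borderline messages toward ham, which is the false-positive bias the comment above refers to:

ham_count  = 3      # raw occurrences of a token in ham training data
total_ham  = 1_000  # total ham words before doubling
vocab_size = 5_000

plain   = (ham_count + 1.0)     / (total_ham + vocab_size)      # ~0.00067, no bias
doubled = (ham_count * 2 + 1.0) / (total_ham * 2 + vocab_size)  # ~0.00100, with bias

puts "#{plain.round(5)} vs #{doubled.round(5)}"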
@@ -70,49 +72,44 @@ def train_batch(trained_messages)
     end
     @classifier_state.save!
   end
-
   def classify(message_text)
-    # P(Spam|Words) = P(Words|Spam) * P(Spam) / P(Words)
-    # Return false if the model isn't trained enough
     @classifier_state.reload
-    return [false, 0.0, 0.0] if @classifier_state.total_ham_messages == 0 || @classifier_state.total_spam_messages == 0
+    return [false, 0.0] if @classifier_state.total_ham_messages.zero? || @classifier_state.total_spam_messages.zero?

-    tokens = tokenize(message_text)
     total_messages = @classifier_state.total_spam_messages + @classifier_state.total_ham_messages

-    # Calculate the prior probabilities in log space
-    # Use Math.log to avoid numerical underflow
-    prob_spam_prior = Math.log(@classifier_state.total_spam_messages.to_f / total_messages)
-    prob_ham_prior = Math.log(@classifier_state.total_ham_messages.to_f / total_messages)
+    # These are the actual priors
+    prob_spam_prior = @classifier_state.total_spam_messages.to_f / total_messages
+    prob_ham_prior = @classifier_state.total_ham_messages.to_f / total_messages
+
+    tokens = tokenize(message_text)

-    spam_score = prob_spam_prior
-    ham_score = prob_ham_prior
+    # Pass the priors to the selection method for consistent logic
+    significant_tokens = get_significant_tokens(tokens, prob_spam_prior, prob_ham_prior)

-    vocab_size = @classifier_state.vocabulary_size
+    # Start the scores with the log of the priors
+    spam_score = Math.log(prob_spam_prior)
+    ham_score = Math.log(prob_ham_prior)

-    tokens.each do |token|
-      # Add 1 for Laplace smoothing to avoid zero probabilities
-      spam_count = @classifier_state.spam_counts.fetch(token, 0) + 1
-      spam_score += Math.log(spam_count.to_f / (@classifier_state.total_spam_words + vocab_size))
+    significant_tokens.each do |token|
+      spam_likelihood, ham_likelihood = get_likelihoods(token)

-      ham_count = @classifier_state.ham_counts.fetch(token, 0) + 1
-      ham_score += Math.log(ham_count.to_f / (@classifier_state.total_ham_words + vocab_size))
+      spam_score += Math.log(spam_likelihood)
+      ham_score += Math.log(ham_likelihood)
     end

     diff = spam_score - ham_score
-    # Stable logistic conversion
-    p_spam = if diff.abs > 700
-      diff > 0 ? 1.0 : 0.0
-    else
-      1.0 / (1.0 + Math.exp(-diff))
-    end
+    p_spam = 1.0 / (1.0 + Math.exp(-diff))

     confidence_threshold = Rails.application.config.probability_threshold
     is_spam = p_spam >= confidence_threshold
-    Rails.logger.info "classified_result: #{is_spam ? "maybe_spam" : "maybe_ham"}, p_spam: #{p_spam}, message_text: #{message_text}"
+
+    Rails.logger.info "classified_result: #{is_spam ? "maybe_spam" : "maybe_ham"}, p_spam: #{p_spam.round(4)}, tokens: #{significant_tokens.join(', ')}"
+
     [is_spam, spam_score, ham_score]
   end

+
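As a rough check of the scoring above, here is a self-contained sketch (the priors and per-token likelihoods are invented; in the real method they come from the trained counts and get_likelihoods) showing how the log-space scores become p_spam through the logistic conversion:

prob_spam_prior = 0.4
prob_ham_prior  = 0.6
# [P(token|spam), P(token|ham)] for two hypothetical tokens
likelihoods = { "free" => [0.010, 0.001], "invoice" => [0.004, 0.002] }

spam_score = Math.log(prob_spam_prior)
ham_score  = Math.log(prob_ham_prior)
likelihoods.each_value do |spam_l, ham_l|
  spam_score += Math.log(spam_l)
  ham_score  += Math.log(ham_l)
end

diff   = spam_score - ham_score        # log-odds of spam vs ham
p_spam = 1.0 / (1.0 + Math.exp(-diff)) # squashes the log-odds into (0, 1)
puts p_spam.round(4)                   # => 0.9302 with these numbers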
   def tokenize(text)
     cleaned_text = clean_text(text)
     # This regex pre-tokenizes the string into 4 groups:
@@ -188,6 +185,50 @@ def pure_numbers?(token)
     token.match?(/^[0-9一二三四五六七八九十百千万亿零]+$/)
   end

+  # Calculates P(token|class) for all cases using Laplace smoothing.
+  def get_likelihoods(token)
+    vocab_size = @classifier_state.vocabulary_size
+
+    # For a spam-only word, ham_count is 0, so ham_likelihood will be very small.
+    # This is the mathematically consistent way to handle it.
+    spam_count = @classifier_state.spam_counts.fetch(token, 0)
+    spam_likelihood = (spam_count + 1.0) / (@classifier_state.total_spam_words + vocab_size)
+
+    ham_count = @classifier_state.ham_counts.fetch(token, 0)
+    ham_likelihood = (ham_count + 1.0) / (@classifier_state.total_ham_words + vocab_size)
+
+    [spam_likelihood, ham_likelihood]
+  end
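A quick numeric illustration of the smoothing (all counts here are invented): a token never seen in ham still gets a small non-zero ham likelihood instead of collapsing the whole ham score to zero.

vocab_size       = 5_000
total_spam_words = 2_000
total_ham_words  = 3_000

spam_likelihood = (9 + 1.0) / (total_spam_words + vocab_size) # seen 9 times in spam => ~0.00143
ham_likelihood  = (0 + 1.0) / (total_ham_words + vocab_size)  # never seen in ham    => 0.000125
puts (spam_likelihood / ham_likelihood).round(1)              # => 11.4, still pulls toward spam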
+
+  # Uses the actual priors when determining "interestingness"
+  def get_significant_tokens(tokens, prob_spam_prior, prob_ham_prior)
+    # Use a Set to consider each unique token only once
+    unique_tokens = tokens.to_set
+
+    token_scores = unique_tokens.map do |token|
+      spam_likelihood, ham_likelihood = get_likelihoods(token)
+
+      # Calculate P(Spam|token) using the real priors:
+      # P(S|W) = P(W|S)P(S) / (P(W|S)P(S) + P(W|H)P(H))
+      prob_word_given_spam = spam_likelihood * prob_spam_prior
+      prob_word_given_ham = ham_likelihood * prob_ham_prior
+
+      # Avoid division by zero if both are 0
+      denominator = prob_word_given_spam + prob_word_given_ham
+      next [token, 0.5] if denominator == 0
+
+      prob = prob_word_given_spam / denominator
+      interestingness = (prob - 0.5).abs
+
+      [token, interestingness]
+    end
+
+    # Select the 15 most interesting tokens
+    token_scores.sort_by { |_, interest| -interest }
+                .first(15)
+                .map { |token, _| token }
+  end
+
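For intuition about the selection step, a standalone sketch with made-up per-token posteriors (in the real method these come from the likelihoods and priors above); tokens far from the neutral 0.5 rank as most interesting:

posteriors = { "free" => 0.98, "meeting" => 0.05, "the" => 0.51, "prize" => 0.90 }

ranked = posteriors
  .map { |token, prob| [token, (prob - 0.5).abs] }  # distance from 0.5 = interestingness
  .sort_by { |_, interest| -interest }

ranked.each { |token, interest| puts "#{token}: #{interest.round(2)}" }
# free: 0.48, meeting: 0.45, prize: 0.4, the: 0.01 (the real method then keeps the top 15)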
   class << self
     def rebuild_all_public
       Rails.logger.info "Starting rebuild for all public classifiers..."