Skip to content

Commit 1dad254

Browse files
Szer and cursoragent authored
Add time-decay weighting support for ML spam detection (#114)
Add configurable time-decay weighting w(t) = exp(-k * age_days) for ML training examples via ML_WEIGHT_DECAY_K env variable (default 0 = disabled). When enabled, recent messages have higher weight in model training, making the model more responsive to new spam patterns. Also add ml-experiment.fsx script for REPL-based comparison of baseline vs weighted models with per-interval precision/recall analysis. Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 9e62f78 commit 1dad254

File tree

4 files changed

+463
-2
lines changed

4 files changed

+463
-2
lines changed

src/VahterBanBot/ML.fs

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ type SpamOrHam =
2222
spam: bool
2323
lessThanNMessagesF: single
2424
moreThanNEmojisF: single
25+
weight: single
2526
createdAt: DateTime }
2627

2728
[<CLIMutable>]
@@ -86,12 +87,20 @@ type MachineLearning(
8687

8788
logger.LogInformation $"Training data count: {rawData.Length}"
8889

90+
let now = DateTime.UtcNow
91+
let k = botConf.MlWeightDecayK
8992
let data =
9093
rawData
9194
|> Array.map (fun x ->
95+
let w =
96+
if k > 0.0 then
97+
single (Math.Exp(-k * (now - x.created_at).TotalDays))
98+
else
99+
1.0f
92100
{ text = x.text
93101
spam = x.spam
94102
createdAt = x.created_at
103+
weight = w
95104
moreThanNEmojisF = if x.custom_emoji_count > botConf.MlCustomEmojiThreshold then 1.0f else 0.0f
96105
lessThanNMessagesF = if x.less_than_n_messages then 1.0f else 0.0f }
97106
)
@@ -105,11 +114,21 @@ type MachineLearning(
105114
let trainingData = trainTestSplit.TrainSet
106115
let testData = trainTestSplit.TestSet
107116

108-
let dataProcessPipeline =
117+
let featurePipeline =
109118
mlContext.Transforms.Text
110119
.FeaturizeText(outputColumnName = "TextFeaturized", inputColumnName = "text")
111120
.Append(mlContext.Transforms.Concatenate(outputColumnName = "Features", inputColumnNames = [|"TextFeaturized"; "lessThanNMessagesF"; "moreThanNEmojisF"|]))
112-
.Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
121+
122+
let dataProcessPipeline =
123+
if k > 0.0 then
124+
featurePipeline.Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
125+
labelColumnName = "spam",
126+
featureColumnName = "Features",
127+
exampleWeightColumnName = "weight",
128+
maximumNumberOfIterations = botConf.MlMaxNumberOfIterations
129+
))
130+
else
131+
featurePipeline.Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
113132
labelColumnName = "spam",
114133
featureColumnName = "Features",
115134
maximumNumberOfIterations = botConf.MlMaxNumberOfIterations
@@ -174,6 +193,7 @@ type MachineLearning(
174193
spam = false
175194
lessThanNMessagesF = if userMsgCount < botConf.MlTrainCriticalMsgCount then 1.0f else 0.0f
176195
moreThanNEmojisF = if emojiCount > botConf.MlCustomEmojiThreshold then 1.0f else 0.0f
196+
weight = 1.0f
177197
createdAt = DateTime.UtcNow }
178198
|> Some
179199
| None ->

src/VahterBanBot/Program.fs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ let botConf =
9797
MlMaxNumberOfIterations = getEnvOr "ML_MAX_NUMBER_OF_ITERATIONS" "50" |> int
9898
MlCustomEmojiThreshold = getEnvOr "ML_CUSTOM_EMOJI_THRESHOLD" "20" |> int
9999
MlStopWordsInChats = getEnvOr "ML_STOP_WORDS_IN_CHATS" "{}" |> fromJson
100+
MlWeightDecayK = getEnvOr "ML_WEIGHT_DECAY_K" "0" |> float
100101
// Reaction spam detection
101102
ReactionSpamEnabled = getEnvOr "REACTION_SPAM_ENABLED" "false" |> bool.Parse
102103
ReactionSpamMinMessages = getEnvOr "REACTION_SPAM_MIN_MESSAGES" "10" |> int

src/VahterBanBot/Types.fs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ type BotConfiguration =
5555
MlMaxNumberOfIterations: int
5656
MlCustomEmojiThreshold: int
5757
MlStopWordsInChats: Dictionary<int64, string list>
58+
/// Time-decay weight parameter: w(t) = exp(-k * age_in_days). Decay is applied only when k > 0; 0 (the default) or any non-positive value means no decay (all weights 1.0).
59+
MlWeightDecayK: float
5860
// Reaction spam detection
5961
ReactionSpamEnabled: bool
6062
ReactionSpamMinMessages: int

0 commit comments

Comments
 (0)