Add time-decay weighting support for ML spam detection (#114)

Szer · cursoragent · web-flow · commit 1dad254b9d62 · 2026-02-13T00:28:47.000Z
Add configurable time-decay weighting w(t) = exp(-k * t) for ML training examples, where t is the example's age in days at training time and k is set via the ML_WEIGHT_DECAY_K environment variable (default 0 = disabled, i.e. every example has weight 1.0). When enabled (k > 0), recent messages carry higher weight in model training, making the model more responsive to new spam patterns. Also add the ml-experiment.fsx script for REPL-based comparison of baseline vs. weighted models with per-interval precision/recall analysis.

Co-authored-by: Cursor <cursoragent@cursor.com>
diff --git a/src/VahterBanBot/ML.fs b/src/VahterBanBot/ML.fs
@@ -22,6 +22,7 @@ type SpamOrHam =
       spam: bool
       lessThanNMessagesF: single
       moreThanNEmojisF: single
+      weight: single
       createdAt: DateTime }
 
 [<CLIMutable>]
@@ -86,12 +87,20 @@ type MachineLearning(
         
         logger.LogInformation $"Training data count: {rawData.Length}"
         
+        let now = DateTime.UtcNow
+        let k = botConf.MlWeightDecayK
         let data =
             rawData
             |> Array.map (fun x ->
+                let w =
+                    if k > 0.0 then
+                        single (Math.Exp(-k * (now - x.created_at).TotalDays))
+                    else
+                        1.0f
                 { text = x.text
                   spam = x.spam
                   createdAt = x.created_at
+                  weight = w
                   moreThanNEmojisF = if x.custom_emoji_count > botConf.MlCustomEmojiThreshold then 1.0f else 0.0f
                   lessThanNMessagesF = if x.less_than_n_messages then 1.0f else 0.0f }
             )
@@ -105,11 +114,21 @@ type MachineLearning(
         let trainingData = trainTestSplit.TrainSet
         let testData = trainTestSplit.TestSet
         
-        let dataProcessPipeline =
+        let featurePipeline =
             mlContext.Transforms.Text
                 .FeaturizeText(outputColumnName = "TextFeaturized", inputColumnName = "text")
                 .Append(mlContext.Transforms.Concatenate(outputColumnName = "Features", inputColumnNames = [|"TextFeaturized"; "lessThanNMessagesF"; "moreThanNEmojisF"|]))
-                .Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
+
+        let dataProcessPipeline =
+            if k > 0.0 then
+                featurePipeline.Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
+                    labelColumnName = "spam",
+                    featureColumnName = "Features",
+                    exampleWeightColumnName = "weight",
+                    maximumNumberOfIterations = botConf.MlMaxNumberOfIterations
+                ))
+            else
+                featurePipeline.Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
                     labelColumnName = "spam",
                     featureColumnName = "Features",
                     maximumNumberOfIterations = botConf.MlMaxNumberOfIterations
@@ -174,6 +193,7 @@ type MachineLearning(
                       spam = false
                       lessThanNMessagesF = if userMsgCount < botConf.MlTrainCriticalMsgCount then 1.0f else 0.0f
                       moreThanNEmojisF = if emojiCount > botConf.MlCustomEmojiThreshold then 1.0f else 0.0f
+                      weight = 1.0f
                       createdAt = DateTime.UtcNow }
                 |> Some
             | None ->
diff --git a/src/VahterBanBot/Program.fs b/src/VahterBanBot/Program.fs
@@ -97,6 +97,7 @@ let botConf =
       MlMaxNumberOfIterations = getEnvOr "ML_MAX_NUMBER_OF_ITERATIONS" "50" |> int
       MlCustomEmojiThreshold = getEnvOr "ML_CUSTOM_EMOJI_THRESHOLD" "20" |> int
       MlStopWordsInChats = getEnvOr "ML_STOP_WORDS_IN_CHATS" "{}" |> fromJson
+      MlWeightDecayK = getEnvOr "ML_WEIGHT_DECAY_K" "0" |> float
       // Reaction spam detection
       ReactionSpamEnabled = getEnvOr "REACTION_SPAM_ENABLED" "false" |> bool.Parse
       ReactionSpamMinMessages = getEnvOr "REACTION_SPAM_MIN_MESSAGES" "10" |> int
diff --git a/src/VahterBanBot/Types.fs b/src/VahterBanBot/Types.fs
@@ -55,6 +55,8 @@ type BotConfiguration =
       MlMaxNumberOfIterations: int
       MlCustomEmojiThreshold: int
       MlStopWordsInChats: Dictionary<int64, string list>
+      /// Time-decay weight parameter k: w = exp(-k * age_in_days). Values <= 0 disable decay (all weights 1.0).
+      MlWeightDecayK: float
       // Reaction spam detection
       ReactionSpamEnabled: bool
       ReactionSpamMinMessages: int
diff --git a/src/ml-experiment.fsx b/src/ml-experiment.fsx