Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions src/VahterBanBot/ML.fs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ type SpamOrHam =
spam: bool
lessThanNMessagesF: single
moreThanNEmojisF: single
weight: single
createdAt: DateTime }

[<CLIMutable>]
Expand Down Expand Up @@ -86,12 +87,20 @@ type MachineLearning(

logger.LogInformation $"Training data count: {rawData.Length}"

// Single reference instant, so every sample's age is measured against the same "now".
let now = DateTime.UtcNow
// Decay rate taken from configuration; decay is applied only when k is strictly positive.
let k = botConf.MlWeightDecayK
// Project each raw row into a record carrying the text, label, flag features and a per-example weight.
let data =
rawData
|> Array.map (fun x ->
// Exponential time decay: weight = exp(-k * age in days), converted to single.
// When k is not strictly positive, every example gets the neutral weight 1.0f.
// NOTE(review): `now` is UTC; this assumes x.created_at is UTC as well — confirm.
let w =
if k > 0.0 then
single (Math.Exp(-k * (now - x.created_at).TotalDays))
else
1.0f
{ text = x.text
spam = x.spam
createdAt = x.created_at
weight = w
moreThanNEmojisF = if x.custom_emoji_count > botConf.MlCustomEmojiThreshold then 1.0f else 0.0f
lessThanNMessagesF = if x.less_than_n_messages then 1.0f else 0.0f }
)
Expand All @@ -105,11 +114,21 @@ type MachineLearning(
let trainingData = trainTestSplit.TrainSet
let testData = trainTestSplit.TestSet

let dataProcessPipeline =
// Feature-extraction stage: featurizes the "text" column into "TextFeaturized",
// then concatenates it with the "lessThanNMessagesF" and "moreThanNEmojisF" columns
// into a single "Features" column.
let featurePipeline =
mlContext.Transforms.Text
.FeaturizeText(outputColumnName = "TextFeaturized", inputColumnName = "text")
.Append(mlContext.Transforms.Concatenate(outputColumnName = "Features", inputColumnNames = [|"TextFeaturized"; "lessThanNMessagesF"; "moreThanNEmojisF"|]))
.Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(

let dataProcessPipeline =
if k > 0.0 then
featurePipeline.Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
labelColumnName = "spam",
featureColumnName = "Features",
exampleWeightColumnName = "weight",
maximumNumberOfIterations = botConf.MlMaxNumberOfIterations
))
else
featurePipeline.Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
labelColumnName = "spam",
featureColumnName = "Features",
maximumNumberOfIterations = botConf.MlMaxNumberOfIterations
Expand Down Expand Up @@ -174,6 +193,7 @@ type MachineLearning(
spam = false
lessThanNMessagesF = if userMsgCount < botConf.MlTrainCriticalMsgCount then 1.0f else 0.0f
moreThanNEmojisF = if emojiCount > botConf.MlCustomEmojiThreshold then 1.0f else 0.0f
weight = 1.0f
createdAt = DateTime.UtcNow }
|> Some
| None ->
Expand Down
1 change: 1 addition & 0 deletions src/VahterBanBot/Program.fs
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ let botConf =
MlMaxNumberOfIterations = getEnvOr "ML_MAX_NUMBER_OF_ITERATIONS" "50" |> int
MlCustomEmojiThreshold = getEnvOr "ML_CUSTOM_EMOJI_THRESHOLD" "20" |> int
MlStopWordsInChats = getEnvOr "ML_STOP_WORDS_IN_CHATS" "{}" |> fromJson
MlWeightDecayK = getEnvOr "ML_WEIGHT_DECAY_K" "0" |> float
// Reaction spam detection
ReactionSpamEnabled = getEnvOr "REACTION_SPAM_ENABLED" "false" |> bool.Parse
ReactionSpamMinMessages = getEnvOr "REACTION_SPAM_MIN_MESSAGES" "10" |> int
Expand Down
2 changes: 2 additions & 0 deletions src/VahterBanBot/Types.fs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ type BotConfiguration =
MlMaxNumberOfIterations: int
MlCustomEmojiThreshold: int
MlStopWordsInChats: Dictionary<int64, string list>
/// Time-decay weight parameter: w(t) = exp(-k * age_in_days). Any value that is not strictly positive (default 0) disables decay, so all weights are 1.0.
MlWeightDecayK: float
// Reaction spam detection
ReactionSpamEnabled: bool
ReactionSpamMinMessages: int
Expand Down
Loading