From af1b0de4b71d7f2672d5ae0772008e0e1fcf1024 Mon Sep 17 00:00:00 2001 From: coopa33 Date: Fri, 13 Jun 2025 16:55:06 +0200 Subject: [PATCH 01/12] Added QML components for balancing labels in data --- inst/qml/common/ui/BalanceLabels.qml | 38 +++++++++++++++++++ .../mlClassificationLogisticMultinomial.qml | 5 +++ 2 files changed, 43 insertions(+) create mode 100644 inst/qml/common/ui/BalanceLabels.qml diff --git a/inst/qml/common/ui/BalanceLabels.qml b/inst/qml/common/ui/BalanceLabels.qml new file mode 100644 index 00000000..d4f4f837 --- /dev/null +++ b/inst/qml/common/ui/BalanceLabels.qml @@ -0,0 +1,38 @@ +// +// Copyright (C) 2013-2018 University of Amsterdam +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public +// License along with this program. If not, see +// . 
+// +import QtQuick +import QtQuick.Layouts +import JASP.Controls + +RadioButtonGroup +{ + name: "balanceLabels" + title: qsTr("Whether to have equal distribution of labels in the data") + RadioButton + { + name: "balanced" + text: qsTr("Balance distribution of labels") + checked: true + + } + RadioButton + { + name:"unbalanced" + text: qsTr("Keep original distribution of labels") + } +} \ No newline at end of file diff --git a/inst/qml/mlClassificationLogisticMultinomial.qml b/inst/qml/mlClassificationLogisticMultinomial.qml index 72ed3c41..c3276851 100644 --- a/inst/qml/mlClassificationLogisticMultinomial.qml +++ b/inst/qml/mlClassificationLogisticMultinomial.qml @@ -95,4 +95,9 @@ Form } } } + Section + { + title: qsTr("Balancing Labels") + UI.BalanceLabels { } + } } From b5f64b58a843bc9144bdc50f04b3d11ea1e0bdb0 Mon Sep 17 00:00:00 2001 From: coopa33 Date: Sat, 14 Jun 2025 13:37:53 +0200 Subject: [PATCH 02/12] Added functionality to balance targets in R-scripts: 1) Added a common function (in commonMachineLearningClassification.R) called .balance_dataset. 2) Added conditional activation of .balance_dataset in .mlClassificationReadData, depending on options[[balanceLabels]]. 3) Added the dependency in .mlClassificationDependencies. 
--- R/commonMachineLearningClassification.R | 24 ++++++++++++++++++- inst/qml/common/ui/BalanceLabels.qml | 8 +++---- .../mlClassificationLogisticMultinomial.qml | 1 + 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/R/commonMachineLearningClassification.R b/R/commonMachineLearningClassification.R index daafcc1b..1e0a1a7b 100644 --- a/R/commonMachineLearningClassification.R +++ b/R/commonMachineLearningClassification.R @@ -33,7 +33,8 @@ "noOfTrees", "maxTrees", "baggingFraction", "noOfPredictors", "numberOfPredictors", # Random forest "complexityParameter", "degree", "gamma", "cost", "tolerance", "epsilon", "maxCost", # Support vector machine "smoothingParameter", # Naive Bayes - "intercept", "link" # Logistic + "intercept", "link", + "balanceLabels" # Common ) if (includeSaveOptions) { opt <- c(opt, "saveModel", "savePath") @@ -41,9 +42,30 @@ return(opt) } +.balance_dataset <- function(dataset, options) { + # Extract targets and split data + target <- dataset[, options[["target"]]] + split_data <- split(dataset, target) + + # Determine minimum sample size out of all levels + min_size <- min(sapply(split_data, nrow)) + + # For each level, sample the minimum number of samples, resulting in a balanced dataset + collection <- lapply(split_data, function(df) {df[sample(nrow(df), size = min_size, replace = FALSE), ]}) + balanced_dataset <- do.call(rbind, collection) + + return (balanced_dataset) +} + .mlClassificationReadData <- function(dataset, options) { dataset <- .readDataClassificationRegressionAnalyses(dataset, options, include_weights = FALSE) if (options[["target"]] != "") { + + # Balance Dataset based on selected Target + if (options[["balanceLabels"]] == "balanced") { + dataset <- .balance_dataset(dataset, options) + } + dataset[, options[["target"]]] <- factor(dataset[, options[["target"]]], ordered = FALSE) } return(dataset) diff --git a/inst/qml/common/ui/BalanceLabels.qml b/inst/qml/common/ui/BalanceLabels.qml index d4f4f837..92309afe 100644 
--- a/inst/qml/common/ui/BalanceLabels.qml +++ b/inst/qml/common/ui/BalanceLabels.qml @@ -25,14 +25,14 @@ RadioButtonGroup title: qsTr("Whether to have equal distribution of labels in the data") RadioButton { - name: "balanced" - text: qsTr("Balance distribution of labels") + value: "balanced" + label: qsTr("Balance distribution of labels") checked: true } RadioButton { - name:"unbalanced" - text: qsTr("Keep original distribution of labels") + value:"unbalanced" + label: qsTr("Keep original distribution of labels") } } \ No newline at end of file diff --git a/inst/qml/mlClassificationLogisticMultinomial.qml b/inst/qml/mlClassificationLogisticMultinomial.qml index c3276851..f40d98c9 100644 --- a/inst/qml/mlClassificationLogisticMultinomial.qml +++ b/inst/qml/mlClassificationLogisticMultinomial.qml @@ -98,6 +98,7 @@ Form Section { title: qsTr("Balancing Labels") + UI.BalanceLabels { } } } From 10194cd2d9a9661c4e1fdf221daa440e04c30d78 Mon Sep 17 00:00:00 2001 From: coopa33 Date: Sat, 14 Jun 2025 14:12:32 +0200 Subject: [PATCH 03/12] changed balance_dataset to .mlBalanceDataset to reflect style. 
Added dependency for .mlPlotDataSplit --- R/commonMachineLearningRegression.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/commonMachineLearningRegression.R b/R/commonMachineLearningRegression.R index 61fd1596..b4fa35e0 100644 --- a/R/commonMachineLearningRegression.R +++ b/R/commonMachineLearningRegression.R @@ -36,7 +36,7 @@ "mutationMethod", "survivalMethod", "elitismProportion", "candidates", # Neural network "noOfTrees", "maxTrees", "baggingFraction", "noOfPredictors", "numberOfPredictors", # Random forest "convergenceThreshold", "penalty", "alpha", "intercept", "lambda", # Regularized - "complexityParameter", "degree", "gamma", "cost", "tolerance", "epsilon", "maxCost" # Support vector machine + "complexityParameter", "degree", "gamma", "cost", "tolerance", "epsilon", "maxCost" # Support vector machine ) if (includeSaveOptions) { opt <- c(opt, "saveModel", "savePath") @@ -166,7 +166,7 @@ if (length(factorsWithNewLevels) > 0) { setType <- switch(type, "test" = gettext("test set"), "validation" = gettext("validation set"), "prediction" = gettext("new dataset")) additionalMessage <- switch(type, - "test" = gettext(" or use a different test set (e.g., automatically by setting a different seed or manually by specifying the test set indicator)"), + "test" = gettext(" or use a different test set (e.g., automatically by setting a different seed or manually by specifying the test set indicator)"), "validation" = gettext(" or use a different validation set by setting a different seed"), "prediction" = "") factorMessage <- paste(sapply(factorsWithNewLevels, function(i) { @@ -597,7 +597,7 @@ } plot <- createJaspPlot(plot = NULL, title = gettext("Data Split"), width = 800, height = 30) plot$position <- position - plot$dependOn(options = c("dataSplitPlot", "target", "predictors", "trainingDataManual", "modelValid", "testSetIndicatorVariable", "testSetIndicator", "validationDataManual", "holdoutData", "testDataManual", "modelOptimization")) + 
plot$dependOn(options = c("balanceLabels", "dataSplitPlot", "target", "predictors", "trainingDataManual", "modelValid", "testSetIndicatorVariable", "testSetIndicator", "validationDataManual", "holdoutData", "testDataManual", "modelOptimization")) jaspResults[["plotDataSplit"]] <- plot if (!ready) { return() From db3e7dbca9b8f2d719d1a28c0e3613a9c2529612 Mon Sep 17 00:00:00 2001 From: coopa33 Date: Sat, 14 Jun 2025 14:12:57 +0200 Subject: [PATCH 04/12] changed balance_dataset to .mlBalanceDataset to reflect style. Added dependency for .mlPlotDataSplit --- R/commonMachineLearningClassification.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/commonMachineLearningClassification.R b/R/commonMachineLearningClassification.R index 1e0a1a7b..8d1f52f7 100644 --- a/R/commonMachineLearningClassification.R +++ b/R/commonMachineLearningClassification.R @@ -42,7 +42,7 @@ return(opt) } -.balance_dataset <- function(dataset, options) { +.mlBalanceDataset <- function(dataset, options) { # Extract targets and split data target <- dataset[, options[["target"]]] split_data <- split(dataset, target) @@ -63,7 +63,7 @@ # Balance Dataset based on selected Target if (options[["balanceLabels"]] == "balanced") { - dataset <- .balance_dataset(dataset, options) + dataset <- .mlBalanceDataset(dataset, options) } dataset[, options[["target"]]] <- factor(dataset[, options[["target"]]], ordered = FALSE) From 33ca688f278a82af37f9983370e40592571bfb58 Mon Sep 17 00:00:00 2001 From: coopa33 Date: Sat, 14 Jun 2025 14:28:42 +0200 Subject: [PATCH 05/12] Remove accidental changes made. Made style adjustments. 
--- R/commonMachineLearningClassification.R | 2 +- R/commonMachineLearningRegression.R | 2 +- inst/qml/common/ui/BalanceLabels.qml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/commonMachineLearningClassification.R b/R/commonMachineLearningClassification.R index 8d1f52f7..46c7f8f1 100644 --- a/R/commonMachineLearningClassification.R +++ b/R/commonMachineLearningClassification.R @@ -33,7 +33,7 @@ "noOfTrees", "maxTrees", "baggingFraction", "noOfPredictors", "numberOfPredictors", # Random forest "complexityParameter", "degree", "gamma", "cost", "tolerance", "epsilon", "maxCost", # Support vector machine "smoothingParameter", # Naive Bayes - "intercept", "link", + "intercept", "link", # Logistic "balanceLabels" # Common ) if (includeSaveOptions) { diff --git a/R/commonMachineLearningRegression.R b/R/commonMachineLearningRegression.R index b4fa35e0..43a4049a 100644 --- a/R/commonMachineLearningRegression.R +++ b/R/commonMachineLearningRegression.R @@ -36,7 +36,7 @@ "mutationMethod", "survivalMethod", "elitismProportion", "candidates", # Neural network "noOfTrees", "maxTrees", "baggingFraction", "noOfPredictors", "numberOfPredictors", # Random forest "convergenceThreshold", "penalty", "alpha", "intercept", "lambda", # Regularized - "complexityParameter", "degree", "gamma", "cost", "tolerance", "epsilon", "maxCost" # Support vector machine + "complexityParameter", "degree", "gamma", "cost", "tolerance", "epsilon", "maxCost" # Support vector machine ) if (includeSaveOptions) { opt <- c(opt, "saveModel", "savePath") diff --git a/inst/qml/common/ui/BalanceLabels.qml b/inst/qml/common/ui/BalanceLabels.qml index 92309afe..8042e547 100644 --- a/inst/qml/common/ui/BalanceLabels.qml +++ b/inst/qml/common/ui/BalanceLabels.qml @@ -28,8 +28,8 @@ RadioButtonGroup value: "balanced" label: qsTr("Balance distribution of labels") checked: true - } + RadioButton { value:"unbalanced" From 942ec3c33e295266fca8c186d7ba4cb8d69d7cc3 Mon Sep 17 00:00:00 2001 From: 
coopa33 Date: Sat, 14 Jun 2025 15:13:04 +0200 Subject: [PATCH 06/12] Moved option to balance targets to the data split section. --- R/commonMachineLearningClassification.R | 2 +- inst/qml/common/ui/BalanceLabels.qml | 38 ------------------- inst/qml/common/ui/DataSplit.qml | 6 +++ .../mlClassificationLogisticMultinomial.qml | 6 --- 4 files changed, 7 insertions(+), 45 deletions(-) delete mode 100644 inst/qml/common/ui/BalanceLabels.qml diff --git a/R/commonMachineLearningClassification.R b/R/commonMachineLearningClassification.R index 46c7f8f1..0b48d97d 100644 --- a/R/commonMachineLearningClassification.R +++ b/R/commonMachineLearningClassification.R @@ -62,7 +62,7 @@ if (options[["target"]] != "") { # Balance Dataset based on selected Target - if (options[["balanceLabels"]] == "balanced") { + if (options[["balanceLabels"]] == TRUE) { dataset <- .mlBalanceDataset(dataset, options) } diff --git a/inst/qml/common/ui/BalanceLabels.qml b/inst/qml/common/ui/BalanceLabels.qml deleted file mode 100644 index 8042e547..00000000 --- a/inst/qml/common/ui/BalanceLabels.qml +++ /dev/null @@ -1,38 +0,0 @@ -// -// Copyright (C) 2013-2018 University of Amsterdam -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as -// published by the Free Software Foundation, either version 3 of the -// License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public -// License along with this program. If not, see -// . 
-// -import QtQuick -import QtQuick.Layouts -import JASP.Controls - -RadioButtonGroup -{ - name: "balanceLabels" - title: qsTr("Whether to have equal distribution of labels in the data") - RadioButton - { - value: "balanced" - label: qsTr("Balance distribution of labels") - checked: true - } - - RadioButton - { - value:"unbalanced" - label: qsTr("Keep original distribution of labels") - } -} \ No newline at end of file diff --git a/inst/qml/common/ui/DataSplit.qml b/inst/qml/common/ui/DataSplit.qml index 02faac76..034da3ba 100644 --- a/inst/qml/common/ui/DataSplit.qml +++ b/inst/qml/common/ui/DataSplit.qml @@ -157,4 +157,10 @@ Section info: qsTr("Partition the remaining data in *n* parts.") } } + CheckBox + { + name: "balanceLabels" + label: qsTr("Balance sample size of target levels") + info: qsTr("When clicked, the dataset is balanced to have the same sample size for all levels of the target variable.") + } } diff --git a/inst/qml/mlClassificationLogisticMultinomial.qml b/inst/qml/mlClassificationLogisticMultinomial.qml index f40d98c9..72ed3c41 100644 --- a/inst/qml/mlClassificationLogisticMultinomial.qml +++ b/inst/qml/mlClassificationLogisticMultinomial.qml @@ -95,10 +95,4 @@ Form } } } - Section - { - title: qsTr("Balancing Labels") - - UI.BalanceLabels { } - } } From 9c63f09edddd31178995c0024eafdcb9777baf79 Mon Sep 17 00:00:00 2001 From: coopa33 Date: Sat, 14 Jun 2025 15:59:27 +0200 Subject: [PATCH 07/12] Added option to balance either through over- or under-sampling --- R/commonMachineLearningClassification.R | 25 +++++++++++++++++++------ R/commonMachineLearningRegression.R | 2 +- inst/qml/common/ui/DataSplit.qml | 11 ++++++++++- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/R/commonMachineLearningClassification.R b/R/commonMachineLearningClassification.R index 0b48d97d..f956681d 100644 --- a/R/commonMachineLearningClassification.R +++ b/R/commonMachineLearningClassification.R @@ -34,7 +34,7 @@ "complexityParameter", "degree", "gamma", 
"cost", "tolerance", "epsilon", "maxCost", # Support vector machine "smoothingParameter", # Naive Bayes "intercept", "link", # Logistic - "balanceLabels" # Common + "balanceLabels", "balanceSamplingMethod" # Common ) if (includeSaveOptions) { opt <- c(opt, "saveModel", "savePath") @@ -47,12 +47,25 @@ target <- dataset[, options[["target"]]] split_data <- split(dataset, target) - # Determine minimum sample size out of all levels - min_size <- min(sapply(split_data, nrow)) + # Either over- or undersampling, based on user choice + if (options[["balanceSamplingMethod"]] == "minSample") { - # For each level, sample the minimum number of samples, resulting in a balanced dataset - collection <- lapply(split_data, function(df) {df[sample(nrow(df), size = min_size, replace = FALSE), ]}) - balanced_dataset <- do.call(rbind, collection) + # Determine minimum sample size out of all levels + min_size <- min(sapply(split_data, nrow)) + + # For each level, undersample to the minimum sample size found, using without-replacement sampling + collection <- lapply(split_data, function(df) {df[sample(nrow(df), size = min_size, replace = FALSE), ]}) + balanced_dataset <- do.call(rbind, collection) + } + else { + + # Determine minimum sample size out of all levels + max_size <- max(sapply(split_data, nrow)) + + # For each level, oversample to the maximum sample size found, using with-replacement sampling + collection <- lapply(split_data, function(df) {df[sample(nrow(df), size = max_size, replace = TRUE), ]}) + balanced_dataset <- do.call(rbind, collection) + } return (balanced_dataset) } diff --git a/R/commonMachineLearningRegression.R b/R/commonMachineLearningRegression.R index 43a4049a..40ede4c5 100644 --- a/R/commonMachineLearningRegression.R +++ b/R/commonMachineLearningRegression.R @@ -597,7 +597,7 @@ } plot <- createJaspPlot(plot = NULL, title = gettext("Data Split"), width = 800, height = 30) plot$position <- position - plot$dependOn(options = c("balanceLabels", "dataSplitPlot", 
"target", "predictors", "trainingDataManual", "modelValid", "testSetIndicatorVariable", "testSetIndicator", "validationDataManual", "holdoutData", "testDataManual", "modelOptimization")) + plot$dependOn(options = c("balanceSamplingMethod", "balanceLabels", "dataSplitPlot", "target", "predictors", "trainingDataManual", "modelValid", "testSetIndicatorVariable", "testSetIndicator", "validationDataManual", "holdoutData", "testDataManual", "modelOptimization")) jaspResults[["plotDataSplit"]] <- plot if (!ready) { return() diff --git a/inst/qml/common/ui/DataSplit.qml b/inst/qml/common/ui/DataSplit.qml index 034da3ba..71db63c4 100644 --- a/inst/qml/common/ui/DataSplit.qml +++ b/inst/qml/common/ui/DataSplit.qml @@ -161,6 +161,15 @@ Section { name: "balanceLabels" label: qsTr("Balance sample size of target levels") - info: qsTr("When clicked, the dataset is balanced to have the same sample size for all levels of the target variable.") + info: qsTr("When clicked, the dataset is balanced to have the same sample size for all levels of the target variable. 
This is done either through over- or under-sampling") + + RadioButtonGroup + + { + name: "balanceSamplingMethod" + + RadioButton{ value: "minSample"; label: qsTr("Balance using undersampling"); checked: true} + RadioButton{ value: "maxSample"; label: qsTr("Balance using oversampling")} + } } } From d2815880f58d9c7dcbe7c6b7be796ad6bdfd4f3a Mon Sep 17 00:00:00 2001 From: coopa33 Date: Sat, 14 Jun 2025 16:19:14 +0200 Subject: [PATCH 08/12] Removed balance target interface from data split, and put it in a separate section, because data split is shared in regression analyses, which do not require balancing targets (target is continuous) --- inst/qml/common/ui/BalanceTarget.qml | 41 +++++++++++++++++++ inst/qml/common/ui/DataSplit.qml | 15 ------- .../mlClassificationLogisticMultinomial.qml | 1 + 3 files changed, 42 insertions(+), 15 deletions(-) create mode 100644 inst/qml/common/ui/BalanceTarget.qml diff --git a/inst/qml/common/ui/BalanceTarget.qml b/inst/qml/common/ui/BalanceTarget.qml new file mode 100644 index 00000000..c64da5b3 --- /dev/null +++ b/inst/qml/common/ui/BalanceTarget.qml @@ -0,0 +1,41 @@ +// +// Copyright (C) 2013-2018 University of Amsterdam +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public +// License along with this program. If not, see +// . 
+// + +import QtQuick +import QtQuick.Layouts +import JASP.Controls +Section +{ + title: qsTr("Balance Target Preferences") + + CheckBox + { + name: "balanceLabels" + label: qsTr("Balance sample size of target levels") + info: qsTr("When clicked, the dataset is balanced to have the same sample size for all levels of the target variable. This is done either through over- or under-sampling") + + RadioButtonGroup + + { + name: "balanceSamplingMethod" + + RadioButton{ value: "minSample"; label: qsTr("Balance using undersampling"); checked: true} + RadioButton{ value: "maxSample"; label: qsTr("Balance using oversampling")} + } + } +} diff --git a/inst/qml/common/ui/DataSplit.qml b/inst/qml/common/ui/DataSplit.qml index 71db63c4..02faac76 100644 --- a/inst/qml/common/ui/DataSplit.qml +++ b/inst/qml/common/ui/DataSplit.qml @@ -157,19 +157,4 @@ Section info: qsTr("Partition the remaining data in *n* parts.") } } - CheckBox - { - name: "balanceLabels" - label: qsTr("Balance sample size of target levels") - info: qsTr("When clicked, the dataset is balanced to have the same sample size for all levels of the target variable. 
This is done either through over- or under-sampling") - - RadioButtonGroup - - { - name: "balanceSamplingMethod" - - RadioButton{ value: "minSample"; label: qsTr("Balance using undersampling"); checked: true} - RadioButton{ value: "maxSample"; label: qsTr("Balance using oversampling")} - } - } } diff --git a/inst/qml/mlClassificationLogisticMultinomial.qml b/inst/qml/mlClassificationLogisticMultinomial.qml index 72ed3c41..deff04a1 100644 --- a/inst/qml/mlClassificationLogisticMultinomial.qml +++ b/inst/qml/mlClassificationLogisticMultinomial.qml @@ -55,6 +55,7 @@ Form UI.ExportResults { enabled: vars.predictorCount > 0 && vars.targetCount > 0 } UI.DataSplit { trainingValidationSplit: false } + UI.BalanceTarget {} Section { From c11530e90b4ee145c3a3bdb8f349fec1fc668eeb Mon Sep 17 00:00:00 2001 From: coopa33 Date: Sat, 14 Jun 2025 17:25:06 +0200 Subject: [PATCH 09/12] Put balancing option back into data split. Added alias for visible property, and set it to 'false' for all Regression analyses --- inst/qml/common/ui/BalanceTarget.qml | 41 ------------------- inst/qml/common/ui/DataSplit.qml | 17 ++++++++ .../mlClassificationLogisticMultinomial.qml | 1 - inst/qml/mlRegressionBoosting.qml | 9 +++- inst/qml/mlRegressionDecisionTree.qml | 8 +++- inst/qml/mlRegressionKnn.qml | 2 +- inst/qml/mlRegressionLinear.qml | 2 +- inst/qml/mlRegressionNeuralNetwork.qml | 2 +- inst/qml/mlRegressionRandomForest.qml | 8 +++- inst/qml/mlRegressionRegularized.qml | 1 + inst/qml/mlRegressionSvm.qml | 8 +++- 11 files changed, 49 insertions(+), 50 deletions(-) delete mode 100644 inst/qml/common/ui/BalanceTarget.qml diff --git a/inst/qml/common/ui/BalanceTarget.qml b/inst/qml/common/ui/BalanceTarget.qml deleted file mode 100644 index c64da5b3..00000000 --- a/inst/qml/common/ui/BalanceTarget.qml +++ /dev/null @@ -1,41 +0,0 @@ -// -// Copyright (C) 2013-2018 University of Amsterdam -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU 
Affero General Public License as -// published by the Free Software Foundation, either version 3 of the -// License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public -// License along with this program. If not, see -// . -// - -import QtQuick -import QtQuick.Layouts -import JASP.Controls -Section -{ - title: qsTr("Balance Target Preferences") - - CheckBox - { - name: "balanceLabels" - label: qsTr("Balance sample size of target levels") - info: qsTr("When clicked, the dataset is balanced to have the same sample size for all levels of the target variable. This is done either through over- or under-sampling") - - RadioButtonGroup - - { - name: "balanceSamplingMethod" - - RadioButton{ value: "minSample"; label: qsTr("Balance using undersampling"); checked: true} - RadioButton{ value: "maxSample"; label: qsTr("Balance using oversampling")} - } - } -} diff --git a/inst/qml/common/ui/DataSplit.qml b/inst/qml/common/ui/DataSplit.qml index 02faac76..a3422e02 100644 --- a/inst/qml/common/ui/DataSplit.qml +++ b/inst/qml/common/ui/DataSplit.qml @@ -25,6 +25,7 @@ Section property alias leaveOneOutVisible: leaveOneOut.visible property alias kFoldsVisible: kFolds.visible property alias trainingValidationSplit: trainingValidationSplit.visible + property alias balanceTargetClasses: balanceTargetClasses.visible title: qsTr("Data Split Preferences") @@ -157,4 +158,20 @@ Section info: qsTr("Partition the remaining data in *n* parts.") } } + + CheckBox + { + id: balanceTargetClasses + name: "balanceLabels" + label: qsTr("Balance sample size of target classes") + info: qsTr("When clicked, the dataset is balanced to have the same sample size for all classes of the target 
variable. This is done either through over- or undersampling") + + RadioButtonGroup + { + name: "balanceSamplingMethod" + + RadioButton{ value: "minSample"; label: qsTr("Undersample"); checked: true; info: qsTr("Balances the target classes by undersampling to match the size of the smallest class.")} + RadioButton{ value: "maxSample"; label: qsTr("Oversample"); info: qsTr("Balances the target classes by oversampling to match the size of the largest class. This is done by sampling with replacement for smaller classes.")} + } + } } diff --git a/inst/qml/mlClassificationLogisticMultinomial.qml b/inst/qml/mlClassificationLogisticMultinomial.qml index deff04a1..72ed3c41 100644 --- a/inst/qml/mlClassificationLogisticMultinomial.qml +++ b/inst/qml/mlClassificationLogisticMultinomial.qml @@ -55,7 +55,6 @@ Form UI.ExportResults { enabled: vars.predictorCount > 0 && vars.targetCount > 0 } UI.DataSplit { trainingValidationSplit: false } - UI.BalanceTarget {} Section { diff --git a/inst/qml/mlRegressionBoosting.qml b/inst/qml/mlRegressionBoosting.qml index 9e411a5e..70a4ced1 100644 --- a/inst/qml/mlRegressionBoosting.qml +++ b/inst/qml/mlRegressionBoosting.qml @@ -44,7 +44,7 @@ Form { title: qsTr("Plots") - FIG.DataSplit { } + FIG.DataSplit {} FIG.PredictivePerformance { } BOOSTING.Oob { } BOOSTING.Deviance { } @@ -52,7 +52,12 @@ Form } UI.ExportResults { enabled: vars.predictorCount > 1 && vars.targetCount > 0 } - UI.DataSplit { leaveOneOutVisible: false; trainingValidationSplit: !optim.isManual } + UI.DataSplit + { + leaveOneOutVisible: false + trainingValidationSplit: !optim.isManual + balanceTargetClasses: false + } Section { diff --git a/inst/qml/mlRegressionDecisionTree.qml b/inst/qml/mlRegressionDecisionTree.qml index 12c7a596..86a9d6da 100644 --- a/inst/qml/mlRegressionDecisionTree.qml +++ b/inst/qml/mlRegressionDecisionTree.qml @@ -52,7 +52,13 @@ Form } UI.ExportResults { enabled: vars.predictorCount > 0 && vars.targetCount > 0 } - UI.DataSplit { leaveOneOutVisible: 
false; kFoldsVisible: false; trainingValidationSplit: !optim.isManual } + UI.DataSplit + { + leaveOneOutVisible: false + kFoldsVisible: false + trainingValidationSplit: !optim.isManual + balanceTargetClasses: false + } Section { diff --git a/inst/qml/mlRegressionKnn.qml b/inst/qml/mlRegressionKnn.qml index e95b7f3b..43622606 100644 --- a/inst/qml/mlRegressionKnn.qml +++ b/inst/qml/mlRegressionKnn.qml @@ -52,7 +52,7 @@ Form UI.ExportResults { enabled: vars.predictorCount > 0 && vars.targetCount > 0 } - UI.DataSplit { trainingValidationSplit: !optim.isManual } + UI.DataSplit { trainingValidationSplit: !optim.isManual ; balanceTargetClasses: false } Section { diff --git a/inst/qml/mlRegressionLinear.qml b/inst/qml/mlRegressionLinear.qml index 3dd82086..b6af5548 100644 --- a/inst/qml/mlRegressionLinear.qml +++ b/inst/qml/mlRegressionLinear.qml @@ -50,7 +50,7 @@ Form } UI.ExportResults { enabled: vars.predictorCount > 0 && vars.targetCount > 0 } - UI.DataSplit { trainingValidationSplit: false } + UI.DataSplit { trainingValidationSplit: false; balanceTargetClasses: false } Section { diff --git a/inst/qml/mlRegressionNeuralNetwork.qml b/inst/qml/mlRegressionNeuralNetwork.qml index 07e827be..5f125861 100644 --- a/inst/qml/mlRegressionNeuralNetwork.qml +++ b/inst/qml/mlRegressionNeuralNetwork.qml @@ -53,7 +53,7 @@ Form } UI.ExportResults { enabled: vars.predictorCount > 0 && vars.targetCount > 0 } - UI.DataSplit { leaveOneOutVisible: false; kFoldsVisible: false } + UI.DataSplit { leaveOneOutVisible: false; kFoldsVisible: false; balanceTargetClasses: false } Section { diff --git a/inst/qml/mlRegressionRandomForest.qml b/inst/qml/mlRegressionRandomForest.qml index 6c4cd698..80c3b881 100644 --- a/inst/qml/mlRegressionRandomForest.qml +++ b/inst/qml/mlRegressionRandomForest.qml @@ -52,7 +52,13 @@ Form } UI.ExportResults { enabled: vars.predictorCount > 1 && vars.targetCount > 0 } - UI.DataSplit { leaveOneOutVisible: false; kFoldsVisible: false; trainingValidationSplit: 
!optim.isManual } + UI.DataSplit + { + leaveOneOutVisible: false + kFoldsVisible: false + trainingValidationSplit: !optim.isManual + balanceTargetClasses: false + } Section { diff --git a/inst/qml/mlRegressionRegularized.qml b/inst/qml/mlRegressionRegularized.qml index ae898b1b..888480f7 100644 --- a/inst/qml/mlRegressionRegularized.qml +++ b/inst/qml/mlRegressionRegularized.qml @@ -85,6 +85,7 @@ Form leaveOneOutVisible: false kFoldsVisible: false trainingValidationSplit: !fixedModel.checked + balanceTargetClasses: false } Section diff --git a/inst/qml/mlRegressionSvm.qml b/inst/qml/mlRegressionSvm.qml index 7e4f62fa..7a39b654 100644 --- a/inst/qml/mlRegressionSvm.qml +++ b/inst/qml/mlRegressionSvm.qml @@ -51,7 +51,13 @@ Form } UI.ExportResults { enabled: vars.predictorCount > 0 && vars.targetCount > 0 } - UI.DataSplit { leaveOneOutVisible: false; kFoldsVisible: false; trainingValidationSplit: !optim.isManual } + UI.DataSplit + { + leaveOneOutVisible: false + kFoldsVisible: false + trainingValidationSplit: !optim.isManual + balanceTargetClasses: false + } Section { From be328e3a85402c0ff0eef2c7adc3bc3f8156a5ba Mon Sep 17 00:00:00 2001 From: coopa33 Date: Sun, 15 Jun 2025 15:43:23 +0200 Subject: [PATCH 10/12] Removed redundancies in .mlBalanceDataset --- R/commonMachineLearningClassification.R | 42 ++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/R/commonMachineLearningClassification.R b/R/commonMachineLearningClassification.R index f956681d..2f6bf3bd 100644 --- a/R/commonMachineLearningClassification.R +++ b/R/commonMachineLearningClassification.R @@ -43,42 +43,42 @@ } .mlBalanceDataset <- function(dataset, options) { - # Extract targets and split data - target <- dataset[, options[["target"]]] - split_data <- split(dataset, target) - # Either over- or undersampling, based on user choice + # Extract classes and split data into into homogeneous class groups + classes <- dataset[, options[["target"]]] + splitData <- 
split(dataset, classes) + + # If user chooses not to balance classes, just return the original dataset + if (!isTRUE(options[["balanceLabels"]])){ + return (dataset) + } + + # User chooses undersampling if (options[["balanceSamplingMethod"]] == "minSample") { # Determine minimum sample size out of all levels - min_size <- min(sapply(split_data, nrow)) - - # For each level, undersample to the minimum sample size found, using without-replacement sampling - collection <- lapply(split_data, function(df) {df[sample(nrow(df), size = min_size, replace = FALSE), ]}) - balanced_dataset <- do.call(rbind, collection) + n <- min(sapply(splitData, nrow)) + withReplacement <- FALSE } + + # User chooses oversampling else { # Determine minimum sample size out of all levels - max_size <- max(sapply(split_data, nrow)) - - # For each level, oversample to the maximum sample size found, using with-replacement sampling - collection <- lapply(split_data, function(df) {df[sample(nrow(df), size = max_size, replace = TRUE), ]}) - balanced_dataset <- do.call(rbind, collection) + n <- max(sapply(splitData, nrow)) + withReplacement <- TRUE } - return (balanced_dataset) + # For each level, sample n observations using the chosen method. 
+ balancedSplits <- lapply(splitData, function(df) {df[sample(nrow(df), size = n, replace = withReplacement), ]}) + balanced_dataset <- do.call(rbind, balancedSplits) + + return(balanced_dataset) } .mlClassificationReadData <- function(dataset, options) { dataset <- .readDataClassificationRegressionAnalyses(dataset, options, include_weights = FALSE) if (options[["target"]] != "") { - - # Balance Dataset based on selected Target - if (options[["balanceLabels"]] == TRUE) { - dataset <- .mlBalanceDataset(dataset, options) - } - dataset[, options[["target"]]] <- factor(dataset[, options[["target"]]], ordered = FALSE) } return(dataset) From 4c27e61cc6e137e84e930e5dd936d3d7709119f0 Mon Sep 17 00:00:00 2001 From: coopa33 Date: Sun, 15 Jun 2025 15:46:18 +0200 Subject: [PATCH 11/12] .mlBalanceDataset now only operates on training data. I disabled the feature for analyses requiring validation data, pending decision whether we should balance validation data as well --- R/mlClassificationLda.R | 2 +- R/mlClassificationLogisticMultinomial.R | 4 +++- R/mlClassificationNaiveBayes.R | 2 +- inst/qml/mlClassificationBoosting.qml | 7 ++++++- inst/qml/mlClassificationDecisionTree.qml | 8 +++++++- inst/qml/mlClassificationKnn.qml | 2 +- inst/qml/mlClassificationNeuralNetwork.qml | 2 +- inst/qml/mlClassificationRandomForest.qml | 8 +++++++- inst/qml/mlClassificationSvm.qml | 7 ++++++- 9 files changed, 33 insertions(+), 9 deletions(-) diff --git a/R/mlClassificationLda.R b/R/mlClassificationLda.R index 5d1f27ca..6ea85e10 100644 --- a/R/mlClassificationLda.R +++ b/R/mlClassificationLda.R @@ -110,7 +110,7 @@ mlClassificationLda <- function(jaspResults, dataset, options, ...) 
{ # Sample a percentage of the total data set trainingIndex <- sample.int(nrow(dataset), size = ceiling((1 - options[["testDataManual"]]) * nrow(dataset))) } - trainingSet <- dataset[trainingIndex, ] + trainingSet <- .mlBalanceDataset(dataset[trainingIndex, ], options) testSet <- dataset[-trainingIndex, ] # Check for factor levels in the test set that are not in the training set .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test") diff --git a/R/mlClassificationLogisticMultinomial.R b/R/mlClassificationLogisticMultinomial.R index 71f5af13..51555cbb 100644 --- a/R/mlClassificationLogisticMultinomial.R +++ b/R/mlClassificationLogisticMultinomial.R @@ -74,7 +74,9 @@ mlClassificationLogisticMultinomial <- function(jaspResults, dataset, options, . # Sample a percentage of the total data set trainingIndex <- sample.int(nrow(dataset), size = ceiling((1 - options[["testDataManual"]]) * nrow(dataset))) } - trainingSet <- dataset[trainingIndex, ] + # Create training set with optional balanced classes + trainingSet <- .mlBalanceDataset(dataset[trainingIndex, ], options) + # Create the generated test set indicator testIndicatorColumn <- rep(1, nrow(dataset)) testIndicatorColumn[trainingIndex] <- 0 diff --git a/R/mlClassificationNaiveBayes.R b/R/mlClassificationNaiveBayes.R index d2f7a9e0..0602bed2 100644 --- a/R/mlClassificationNaiveBayes.R +++ b/R/mlClassificationNaiveBayes.R @@ -75,7 +75,7 @@ mlClassificationNaiveBayes <- function(jaspResults, dataset, options, ...) 
{ # Sample a percentage of the total data set trainingIndex <- sample.int(nrow(dataset), size = ceiling((1 - options[["testDataManual"]]) * nrow(dataset))) } - trainingSet <- dataset[trainingIndex, ] + trainingSet <- .mlBalanceDataset(dataset[trainingIndex, ], options) # Create the generated test set indicator testIndicatorColumn <- rep(1, nrow(dataset)) testIndicatorColumn[trainingIndex] <- 0 diff --git a/inst/qml/mlClassificationBoosting.qml b/inst/qml/mlClassificationBoosting.qml index 2670c08d..f1310dda 100644 --- a/inst/qml/mlClassificationBoosting.qml +++ b/inst/qml/mlClassificationBoosting.qml @@ -56,7 +56,12 @@ Form } UI.ExportResults { enabled: vars.predictorCount > 1 && vars.targetCount > 0 } - UI.DataSplit { leaveOneOutVisible: false; trainingValidationSplit: !optim.isManual } + UI.DataSplit + { + leaveOneOutVisible: false + trainingValidationSplit: !optim.isManual + balanceTargetClasses: false + } Section { diff --git a/inst/qml/mlClassificationDecisionTree.qml b/inst/qml/mlClassificationDecisionTree.qml index 4da77e1b..82fcda9a 100644 --- a/inst/qml/mlClassificationDecisionTree.qml +++ b/inst/qml/mlClassificationDecisionTree.qml @@ -56,7 +56,13 @@ Form } UI.ExportResults { enabled: vars.predictorCount > 0 && vars.targetCount > 0 } - UI.DataSplit { leaveOneOutVisible: false; kFoldsVisible: false; trainingValidationSplit: !optim.isManual } + UI.DataSplit + { + leaveOneOutVisible: false + kFoldsVisible: false + trainingValidationSplit: !optim.isManual + balanceTargetClasses: false + } Section { diff --git a/inst/qml/mlClassificationKnn.qml b/inst/qml/mlClassificationKnn.qml index 28053f3e..dd8678a4 100644 --- a/inst/qml/mlClassificationKnn.qml +++ b/inst/qml/mlClassificationKnn.qml @@ -55,7 +55,7 @@ Form } UI.ExportResults { enabled: vars.predictorCount > 0 && vars.targetCount > 0 } - UI.DataSplit { trainingValidationSplit: !optim.isManual } + UI.DataSplit { trainingValidationSplit: !optim.isManual; balanceTargetClasses: false } Section { diff --git 
a/inst/qml/mlClassificationNeuralNetwork.qml b/inst/qml/mlClassificationNeuralNetwork.qml index 70fee156..b6a13c68 100644 --- a/inst/qml/mlClassificationNeuralNetwork.qml +++ b/inst/qml/mlClassificationNeuralNetwork.qml @@ -57,7 +57,7 @@ Form } UI.ExportResults { enabled: vars.predictorCount > 0 && vars.targetCount > 0 } - UI.DataSplit { leaveOneOutVisible: false; kFoldsVisible: false } + UI.DataSplit { leaveOneOutVisible: false; kFoldsVisible: false; balanceTargetClasses: false } Section { diff --git a/inst/qml/mlClassificationRandomForest.qml b/inst/qml/mlClassificationRandomForest.qml index 22f7cee1..be91b3df 100644 --- a/inst/qml/mlClassificationRandomForest.qml +++ b/inst/qml/mlClassificationRandomForest.qml @@ -56,7 +56,13 @@ Form } UI.ExportResults { enabled: vars.predictorCount > 1 && vars.targetCount > 0 } - UI.DataSplit { leaveOneOutVisible: false; kFoldsVisible: false; trainingValidationSplit: !optim.isManual } + UI.DataSplit + { + leaveOneOutVisible: false + kFoldsVisible: false + trainingValidationSplit: !optim.isManual + balanceTargetClasses: false + } Section { diff --git a/inst/qml/mlClassificationSvm.qml b/inst/qml/mlClassificationSvm.qml index d3c82ae5..6b430156 100644 --- a/inst/qml/mlClassificationSvm.qml +++ b/inst/qml/mlClassificationSvm.qml @@ -55,7 +55,12 @@ Form } UI.ExportResults { enabled: vars.predictorCount > 0 && vars.targetCount > 0 } - UI.DataSplit { leaveOneOutVisible: false; kFoldsVisible: false; trainingValidationSplit: !optim.isManual } + UI.DataSplit + { + leaveOneOutVisible: false + kFoldsVisible: false + trainingValidationSplit: !optim.isManual + balanceTargetClasses: false } Section { From 1bfa7ae27deddf30ac11c7c19381d0bcc463cdde Mon Sep 17 00:00:00 2001 From: coopa33 Date: Mon, 16 Jun 2025 22:24:11 +0200 Subject: [PATCH 12/12] Made style adjustments --- R/commonMachineLearningClassification.R | 34 ++++++++++++------------- inst/qml/common/ui/DataSplit.qml | 16 ++++++++++-- 2 files changed, 30 insertions(+), 20 deletions(-) 
diff --git a/R/commonMachineLearningClassification.R b/R/commonMachineLearningClassification.R index 2f6bf3bd..a7fbaad4 100644 --- a/R/commonMachineLearningClassification.R +++ b/R/commonMachineLearningClassification.R @@ -42,38 +42,36 @@ return(opt) } +# Function balancing the size of classes of a discrete dependent variable in a dataset .mlBalanceDataset <- function(dataset, options) { + # To balance the classes, this function uses either under- or oversampling to adjust + # the size of each class to either the minimum or maximum class size found in the data. + # The sampling method is random sampling. + + # Ensures that if the option is not selected, balancing will not occur. + if (!isTRUE(options[["balanceLabels"]])) + return(dataset) - # Extract classes and split data into into homogeneous class groups classes <- dataset[, options[["target"]]] splitData <- split(dataset, classes) - # If user chooses not to balance classes, just return the original dataset - if (!isTRUE(options[["balanceLabels"]])){ - return (dataset) - } - - # User chooses undersampling if (options[["balanceSamplingMethod"]] == "minSample") { - - # Determine minimum sample size out of all levels n <- min(sapply(splitData, nrow)) - withReplacement <- FALSE + replace <- FALSE } - # User chooses oversampling else { - - # Determine minimum sample size out of all levels n <- max(sapply(splitData, nrow)) - withReplacement <- TRUE + replace <- TRUE } - # For each level, sample n observations using the chosen method. 
- balancedSplits <- lapply(splitData, function(df) {df[sample(nrow(df), size = n, replace = withReplacement), ]}) - balanced_dataset <- do.call(rbind, balancedSplits) + balancedSplits <- lapply( + X = splitData, + FUN = function(df) {df[sample(nrow(df), size = n, replace = replace), ]} + ) + balancedDataset <- do.call(rbind, balancedSplits) - return(balanced_dataset) + return(balancedDataset) } .mlClassificationReadData <- function(dataset, options) { diff --git a/inst/qml/common/ui/DataSplit.qml b/inst/qml/common/ui/DataSplit.qml index a3422e02..06cb0033 100644 --- a/inst/qml/common/ui/DataSplit.qml +++ b/inst/qml/common/ui/DataSplit.qml @@ -170,8 +170,20 @@ Section { name: "balanceSamplingMethod" - RadioButton{ value: "minSample"; label: qsTr("Undersample"); checked: true; info: qsTr("Balances the target classes by undersampling to match the size of the smallest class.")} - RadioButton{ value: "maxSample"; label: qsTr("Oversample"); info: qsTr("Balances the target classes by oversampling to match the size of the largest class. This is done by sampling with replacement for smaller classes.")} + RadioButton + { + value: "minSample" + label: qsTr("Undersample") + checked: true + info: qsTr("Balances the target classes by undersampling to match the size of the smallest class.") + } + + RadioButton + { + value: "maxSample" + label: qsTr("Oversample") + info: qsTr("Balances the target classes by oversampling to match the size of the largest class. This is done by sampling with replacement for smaller classes.") + } } } }