From 4d5fd7375bd3b68e1ccf582041abe523933d7afe Mon Sep 17 00:00:00 2001 From: Koen Derks Date: Wed, 26 Mar 2025 23:47:11 +0100 Subject: [PATCH 1/4] Class probabilities in prediction analysis --- R/mlPrediction.R | 112 +++++++++++++++++++-------- inst/qml/common/ui/ExportResults.qml | 10 +++ inst/qml/mlPrediction.qml | 1 + 3 files changed, 90 insertions(+), 33 deletions(-) diff --git a/R/mlPrediction.R b/R/mlPrediction.R index 5128061f..ea3f0008 100644 --- a/R/mlPrediction.R +++ b/R/mlPrediction.R @@ -88,74 +88,103 @@ is.jaspMachineLearning <- function(x) { } .mlPredictionGetPredictions.kknn <- function(model, dataset) { if (inherits(model, "jaspClassification")) { - as.character(kknn:::predict.train.kknn(model[["predictive"]], dataset)) + hard <- as.character(kknn:::predict.train.kknn(model[["predictive"]], dataset)) + soft <- kknn:::predict.train.kknn(model[["predictive"]], dataset, type = "prob") + return(list(hard, soft)) } else if (inherits(model, "jaspRegression")) { - as.numeric(kknn:::predict.train.kknn(model[["predictive"]], dataset)) + hard <- as.numeric(kknn:::predict.train.kknn(model[["predictive"]], dataset)) + return(list(hard)) } } .mlPredictionGetPredictions.lda <- function(model, dataset) { - as.character(MASS:::predict.lda(model, newdata = dataset)$class) + hard <- as.character(MASS:::predict.lda(model, newdata = dataset)$class) + soft <- MASS:::predict.lda(model, newdata = dataset)$posterior + return(list(hard, soft)) } .mlPredictionGetPredictions.lm <- function(model, dataset) { - as.numeric(predict(model, newdata = dataset)) + hard <- as.numeric(predict(model, newdata = dataset)) + return(list(hard)) } .mlPredictionGetPredictions.gbm <- function(model, dataset) { if (inherits(model, "jaspClassification")) { - tmp <- gbm:::predict.gbm(model, newdata = dataset, n.trees = model[["n.trees"]], type = "response") - as.character(colnames(tmp)[apply(tmp, 1, which.max)]) + soft <- gbm:::predict.gbm(model, newdata = dataset, n.trees = model[["n.trees"]], type = "response") + hard <- as.character(colnames(soft)[apply(soft, 1, which.max)]) + return(list(hard, soft[, , 1])) } else if (inherits(model, "jaspRegression")) { - as.numeric(gbm:::predict.gbm(model, newdata = dataset, n.trees = model[["n.trees"]], type = "response")) + hard <- as.numeric(gbm:::predict.gbm(model, newdata = dataset, n.trees = model[["n.trees"]], type = "response")) + return(list(hard)) } } .mlPredictionGetPredictions.randomForest <- function(model, dataset) { if (inherits(model, "jaspClassification")) { - as.character(randomForest:::predict.randomForest(model, newdata = dataset)) + hard <- as.character(randomForest:::predict.randomForest(model, newdata = dataset)) + soft <- predict(model, newdata = dataset, type = "prob") + return(list(hard, soft)) } else if (inherits(model, "jaspRegression")) { - as.numeric(randomForest:::predict.randomForest(model, newdata = dataset)) + hard <- as.numeric(randomForest:::predict.randomForest(model, newdata = dataset)) + return(list(hard)) } } .mlPredictionGetPredictions.cv.glmnet <- function(model, dataset) { - as.numeric(glmnet:::predict.cv.glmnet(model, newx = data.matrix(dataset))) + hard <- as.numeric(glmnet:::predict.cv.glmnet(model, newx = data.matrix(dataset))) + return(list(hard)) } .mlPredictionGetPredictions.nn <- function(model, dataset) { if (inherits(model, "jaspClassification")) { - as.character(levels(factor(model[["data"]][[model[["jaspVars"]][["encoded"]]$target]]))[max.col(neuralnet:::predict.nn(model, newdata = dataset))]) + soft <- neuralnet:::predict.nn(model, newdata = dataset) + colnames(soft) <- levels(factor(model[["data"]][[model[["jaspVars"]][["encoded"]]$target]])) + hard <- colnames(soft)[apply(soft, 1, which.max)] + return(list(hard, soft)) } else if (inherits(model, "jaspRegression")) { - as.numeric(neuralnet:::predict.nn(model, newdata = dataset)) + hard <- as.numeric(neuralnet:::predict.nn(model, newdata = dataset)) + return(list(hard)) } } .mlPredictionGetPredictions.rpart <- function(model, dataset) { if (inherits(model, "jaspClassification")) { - as.character(levels(factor(model[["data"]][[model[["jaspVars"]][["encoded"]]$target]]))[max.col(rpart:::predict.rpart(model, newdata = dataset))]) + soft <- rpart:::predict.rpart(model, newdata = dataset) + colnames(soft) <- levels(factor(model[["data"]][[model[["jaspVars"]][["encoded"]]$target]])) + hard <- colnames(soft)[apply(soft, 1, which.max)] + return(list(hard, soft)) } else if (inherits(model, "jaspRegression")) { - as.numeric(rpart:::predict.rpart(model, newdata = dataset)) + hard <- as.numeric(rpart:::predict.rpart(model, newdata = dataset)) + return(list(hard)) } } .mlPredictionGetPredictions.svm <- function(model, dataset) { if (inherits(model, "jaspClassification")) { - as.character(levels(factor(model[["data"]][[model[["jaspVars"]][["encoded"]]$target]]))[e1071:::predict.svm(model, newdata = dataset)]) + soft <- attr(e1071:::predict.svm(model, newdata = dataset, probability = TRUE), "probabilities") + hard <- as.character(e1071:::predict.svm(model, newdata = dataset)) + return(list(hard, soft)) } else if (inherits(model, "jaspRegression")) { - as.numeric(e1071:::predict.svm(model, newdata = dataset)) + hard <- as.numeric(e1071:::predict.svm(model, newdata = dataset)) + return(list(hard)) } } .mlPredictionGetPredictions.naiveBayes <- function(model, dataset) { - as.character(e1071:::predict.naiveBayes(model, newdata = dataset, type = "class")) + soft <- e1071:::predict.naiveBayes(model, newdata = dataset, type = "raw") + hard <- as.character(e1071:::predict.naiveBayes(model, newdata = dataset, type = "class")) + return(list(hard, soft)) } .mlPredictionGetPredictions.glm <- function(model, dataset) { - as.character(levels(as.factor(model$model[[model[["jaspVars"]][["encoded"]]$target]]))[round(predict(model, newdata = dataset, type = "response"), 0) + 1]) + probs <- predict(model, newdata = dataset, type = "response") + soft <- matrix(c(1 - probs, probs), ncol = 2) + colnames(soft) <- levels(as.factor(model$model[[model[["jaspVars"]][["encoded"]]$target]])) + hard <- colnames(soft)[apply(soft, 1, which.max)] + return(list(hard, soft)) } .mlPredictionGetPredictions.vglm <- function(model, dataset) { logodds <- predict(model[["original"]], newdata = dataset) ncategories <- ncol(logodds) + 1 - probabilities <- matrix(0, nrow = nrow(logodds), ncol = ncategories) + soft <- matrix(0, nrow = nrow(logodds), ncol = ncategories) for (i in seq_len(ncategories - 1)) { - probabilities[, i] <- exp(logodds[, i]) + soft[, i] <- exp(logodds[, i]) } - probabilities[, ncategories] <- 1 - row_sums <- rowSums(probabilities) - probabilities <- probabilities / row_sums - predicted_columns <- apply(probabilities, 1, which.max) - as.character(levels(as.factor(model$target))[predicted_columns]) + soft[, ncategories] <- 1 + soft <- soft / rowSums(soft) + colnames(soft) <- as.character(levels(as.factor(model$target))) + hard <- colnames(soft)[apply(soft, 1, which.max)] + return(list(hard, soft)) } # S3 method to make find out number of observations in training data @@ -372,7 +401,7 @@ is.jaspMachineLearning <- function(x) { if (!ready) { return() } - predictions <- .mlPredictionsState(model, dataset, options, jaspResults, ready) + predictions <- .mlPredictionsState(model, dataset, options, jaspResults, ready)[[1]] indexes <- options[["fromIndex"]]:options[["toIndex"]] selection <- predictions[indexes] cols <- list(row = indexes, pred = selection) @@ -397,12 +426,29 @@ is.jaspMachineLearning <- function(x) { } .mlPredictionsAddPredictions <- function(model, dataset, options, jaspResults, ready) { - if (options[["addPredictions"]] && is.null(jaspResults[["predictionsColumn"]]) && options[["predictionsColumn"]] != "" && ready) { - predictionsColumn <- rep(NA, max(as.numeric(rownames(dataset)))) - predictionsColumn[as.numeric(rownames(dataset))] <- .mlPredictionsState(model, dataset, options, jaspResults, ready) - jaspResults[["predictionsColumn"]] <- createJaspColumn(columnName = options[["predictionsColumn"]]) - jaspResults[["predictionsColumn"]]$dependOn(options = c("predictionsColumn", "predictors", "trainedModelFilePath", "scaleVariables", "addPredictions")) - if (inherits(model, "jaspClassification")) jaspResults[["predictionsColumn"]]$setNominal(predictionsColumn) - if (inherits(model, "jaspRegression")) jaspResults[["predictionsColumn"]]$setScale(predictionsColumn) + if (options[["addPredictions"]] && options[["predictionsColumn"]] != "" && ready) { + predictions <- .mlPredictionsState(model, dataset, options, jaspResults, ready) + # Add hard predictions for regression and classification + if (is.null(jaspResults[["predictionsColumn"]])) { + predictionsColumn <- rep(NA, max(as.numeric(rownames(dataset)))) + predictionsColumn[as.numeric(rownames(dataset))] <- predictions[[1]] + jaspResults[["predictionsColumn"]] <- createJaspColumn(columnName = options[["predictionsColumn"]]) + jaspResults[["predictionsColumn"]]$dependOn(options = c("predictionsColumn", "predictors", "trainedModelFilePath", "scaleVariables", "addPredictions")) + if (inherits(model, "jaspClassification")) jaspResults[["predictionsColumn"]]$setNominal(predictionsColumn) + if (inherits(model, "jaspRegression")) jaspResults[["predictionsColumn"]]$setScale(predictionsColumn) + } + # Add predicted probabilities for classification only + if (inherits(model, "jaspClassification") && options[["addProbabilities"]]) { + classNames <- colnames(predictions[[2]]) + for (i in seq_along(classNames)) { + colName <- paste0(decodeColNames(options[["predictionsColumn"]]), "_", classNames[i]) + if (!is.null(jaspResults[[colName]])) { + break + } + jaspResults[[colName]] <- createJaspColumn(columnName = colName) + jaspResults[[colName]]$dependOn(options = c("predictionsColumn", "predictors", "trainedModelFilePath", "scaleVariables", "addPredictions", "addProbabilities")) + jaspResults[[colName]]$setScale(predictions[[2]][, i]) + } + } } } diff --git a/inst/qml/common/ui/ExportResults.qml b/inst/qml/common/ui/ExportResults.qml index c7401e84..43a2158c 100644 --- a/inst/qml/common/ui/ExportResults.qml +++ b/inst/qml/common/ui/ExportResults.qml @@ -24,6 +24,7 @@ Group { property alias enabled: exportSection.enabled property alias showSave: saveGroup.visible + property bool showProbs: false id: exportSection title: qsTr("Export Results") @@ -45,6 +46,15 @@ Group enabled: addPredictions.checked info: qsTr("The column name for the predicted values.") } + + CheckBox + { + id: probabilities + name: "addProbabilities" + text: qsTr("Add probabilities (classification only)") + visible: showProbs + info: qsTr("In classification analyses, also add the predicted probabilities for each class to the data.") + } } Group diff --git a/inst/qml/mlPrediction.qml b/inst/qml/mlPrediction.qml index c0c5cb64..536b7c04 100644 --- a/inst/qml/mlPrediction.qml +++ b/inst/qml/mlPrediction.qml @@ -116,5 +116,6 @@ Form UI.ExportResults { enabled: predictors.count > 1 showSave: false + showProbs: true } } From f3af3a07a0032c748dec3178432b7c2bcc533139 Mon Sep 17 00:00:00 2001 From: Koen Derks Date: Thu, 27 Mar 2025 08:15:26 +0100 Subject: [PATCH 2/4] More explicit tie breaking --- R/mlPrediction.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/R/mlPrediction.R b/R/mlPrediction.R index ea3f0008..d32786a2 100644 --- a/R/mlPrediction.R +++ b/R/mlPrediction.R @@ -88,8 +88,8 @@ is.jaspMachineLearning <- function(x) { } .mlPredictionGetPredictions.kknn <- function(model, dataset) { if (inherits(model, "jaspClassification")) { - hard <- as.character(kknn:::predict.train.kknn(model[["predictive"]], dataset)) soft <- kknn:::predict.train.kknn(model[["predictive"]], dataset, type = "prob") + hard <- colnames(soft)[max.col(soft, ties.method = "random")] return(list(hard, soft)) } else if (inherits(model, "jaspRegression")) { hard <- as.numeric(kknn:::predict.train.kknn(model[["predictive"]], dataset)) @@ -97,8 +97,8 @@ is.jaspMachineLearning <- function(x) { } } .mlPredictionGetPredictions.lda <- function(model, dataset) { - hard <- as.character(MASS:::predict.lda(model, newdata = dataset)$class) soft <- MASS:::predict.lda(model, newdata = dataset)$posterior + hard <- colnames(soft)[max.col(soft, ties.method = "random")] return(list(hard, soft)) } .mlPredictionGetPredictions.lm <- function(model, dataset) { @@ -107,9 +107,9 @@ is.jaspMachineLearning <- function(x) { } .mlPredictionGetPredictions.gbm <- function(model, dataset) { if (inherits(model, "jaspClassification")) { - soft <- gbm:::predict.gbm(model, newdata = dataset, n.trees = model[["n.trees"]], type = "response") - hard <- as.character(colnames(soft)[apply(soft, 1, which.max)]) - return(list(hard, soft[, , 1])) + soft <- gbm:::predict.gbm(model, newdata = dataset, n.trees = model[["n.trees"]], type = "response")[, , 1] + hard <- colnames(soft)[max.col(soft, ties.method = "random")] + return(list(hard, soft)) } else if (inherits(model, "jaspRegression")) { hard <- as.numeric(gbm:::predict.gbm(model, newdata = dataset, n.trees = model[["n.trees"]], type = "response")) return(list(hard)) @@ -117,8 +117,8 @@ is.jaspMachineLearning <- function(x) { } .mlPredictionGetPredictions.randomForest <- function(model, dataset) { if (inherits(model, "jaspClassification")) { - hard <- as.character(randomForest:::predict.randomForest(model, newdata = dataset)) soft <- predict(model, newdata = dataset, type = "prob") + hard <- colnames(soft)[max.col(soft, ties.method = "random")] return(list(hard, soft)) } else if (inherits(model, "jaspRegression")) { hard <- as.numeric(randomForest:::predict.randomForest(model, newdata = dataset)) @@ -133,7 +133,7 @@ is.jaspMachineLearning <- function(x) { if (inherits(model, "jaspClassification")) { soft <- neuralnet:::predict.nn(model, newdata = dataset) colnames(soft) <- levels(factor(model[["data"]][[model[["jaspVars"]][["encoded"]]$target]])) - hard <- colnames(soft)[apply(soft, 1, which.max)] + hard <- colnames(soft)[max.col(soft, ties.method = "random")] return(list(hard, soft)) } else if (inherits(model, "jaspRegression")) { hard <- as.numeric(neuralnet:::predict.nn(model, newdata = dataset)) @@ -144,7 +144,7 @@ is.jaspMachineLearning <- function(x) { if (inherits(model, "jaspClassification")) { soft <- rpart:::predict.rpart(model, newdata = dataset) colnames(soft) <- levels(factor(model[["data"]][[model[["jaspVars"]][["encoded"]]$target]])) - hard <- colnames(soft)[apply(soft, 1, which.max)] + hard <- colnames(soft)[max.col(soft, ties.method = "random")] return(list(hard, soft)) } else if (inherits(model, "jaspRegression")) { hard <- as.numeric(rpart:::predict.rpart(model, newdata = dataset)) @@ -154,7 +154,7 @@ is.jaspMachineLearning <- function(x) { .mlPredictionGetPredictions.svm <- function(model, dataset) { if (inherits(model, "jaspClassification")) { soft <- attr(e1071:::predict.svm(model, newdata = dataset, probability = TRUE), "probabilities") - hard <- as.character(e1071:::predict.svm(model, newdata = dataset)) + hard <- colnames(soft)[max.col(soft, ties.method = "random")] return(list(hard, soft)) } else if (inherits(model, "jaspRegression")) { hard <- as.numeric(e1071:::predict.svm(model, newdata = dataset)) @@ -163,14 +163,14 @@ is.jaspMachineLearning <- function(x) { } .mlPredictionGetPredictions.naiveBayes <- function(model, dataset) { soft <- e1071:::predict.naiveBayes(model, newdata = dataset, type = "raw") - hard <- as.character(e1071:::predict.naiveBayes(model, newdata = dataset, type = "class")) + hard <- colnames(soft)[max.col(soft, ties.method = "random")] return(list(hard, soft)) } .mlPredictionGetPredictions.glm <- function(model, dataset) { probs <- predict(model, newdata = dataset, type = "response") soft <- matrix(c(1 - probs, probs), ncol = 2) colnames(soft) <- levels(as.factor(model$model[[model[["jaspVars"]][["encoded"]]$target]])) - hard <- colnames(soft)[apply(soft, 1, which.max)] + hard <- colnames(soft)[max.col(soft, ties.method = "random")] return(list(hard, soft)) } .mlPredictionGetPredictions.vglm <- function(model, dataset) { @@ -183,7 +183,7 @@ is.jaspMachineLearning <- function(x) { soft[, ncategories] <- 1 soft <- soft / rowSums(soft) colnames(soft) <- as.character(levels(as.factor(model$target))) - hard <- colnames(soft)[apply(soft, 1, which.max)] + hard <- colnames(soft)[max.col(soft, ties.method = "random")] return(list(hard, soft)) } From 2346fe1972ced416587d504c0edde64255b90153 Mon Sep 17 00:00:00 2001 From: Koen Derks Date: Thu, 27 Mar 2025 08:30:28 +0100 Subject: [PATCH 3/4] Make sure shap table works --- R/commonMachineLearningRegression.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/commonMachineLearningRegression.R b/R/commonMachineLearningRegression.R index 8e990f44..90013c24 100644 --- a/R/commonMachineLearningRegression.R +++ b/R/commonMachineLearningRegression.R @@ -786,7 +786,7 @@ } else { explainer <- model[["explainer"]] x_test <- dataset[, predictors] - predictions <- .mlPredictionsState(model, dataset, options, jaspResults, ready)[options[["fromIndex"]]:options[["toIndex"]]] + predictions <- .mlPredictionsState(model, dataset, options, jaspResults, ready)[[1]][options[["fromIndex"]]:options[["toIndex"]]] } from <- min(c(options[["fromIndex"]], options[["toIndex"]] - 1, nrow(x_test))) to <- min(c(options[["toIndex"]], nrow(x_test))) From 6e5752881c10790629e90150416e5869b0836d47 Mon Sep 17 00:00:00 2001 From: Koen Derks Date: Thu, 27 Mar 2025 08:33:57 +0100 Subject: [PATCH 4/4] Update info --- inst/qml/common/ui/ExportResults.qml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/qml/common/ui/ExportResults.qml b/inst/qml/common/ui/ExportResults.qml index 43a2158c..13389d47 100644 --- a/inst/qml/common/ui/ExportResults.qml +++ b/inst/qml/common/ui/ExportResults.qml @@ -53,7 +53,7 @@ Group name: "addProbabilities" text: qsTr("Add probabilities (classification only)") visible: showProbs - info: qsTr("In classification analyses, also add the predicted probabilities for each class to the data.") + info: qsTr("In classification analyses, append the predicted probabilities for each class to the data. For neural networks, this option provides the output of the final layer.") } }