jasp-stats · boutinb · Dec 6, 2024 · Nov 28, 2024 · Nov 30, 2024 · Nov 30, 2024
diff --git a/R/commonMachineLearningRegression.R b/R/commonMachineLearningRegression.R
@@ -72,10 +72,6 @@
   if (length(unlist(options[["predictors"]])) > 0 && options[["scaleVariables"]]) {
     dataset[, options[["predictors"]]] <- .scaleNumericData(dataset[, options[["predictors"]], drop = FALSE])
   }
-  # Make sure the test set indicator is numeric
-  if (options[["testSetIndicatorVariable"]] != "" && options[["holdoutData"]] == "testSetIndicator")
-    dataset[[options[["testSetIndicatorVariable"]]]] <- as.numeric(dataset[[options[["testSetIndicatorVariable"]]]])
-
   return(dataset)
 }
 
@@ -145,6 +141,40 @@
   }
 }
 
+.checkForNewFactorLevelsInPredictionSet <- function(trainingSet, predictionSet, type, model = NULL) {
+  factorNames <- colnames(predictionSet)[sapply(predictionSet, is.factor)]
+  factorNames <- factorNames[which(factorNames %in% colnames(trainingSet))]
+  factorsWithNewLevels <- character()
+  missingLevelsInTrainingSet <- list()
+  for (i in seq_along(factorNames)) {
+    currentFactor <- factorNames[i]
+    factorLevelsInTrainingSet <- unique(trainingSet[[currentFactor]])
+    factorLevelsInPredictionSet <- unique(predictionSet[[currentFactor]])
+    missingLevelsIndex <- which(!(factorLevelsInPredictionSet %in% factorLevelsInTrainingSet))
+    if (length(missingLevelsIndex) > 0) {
+      if (type != "prediction") {
+        factorsWithNewLevels <- c(factorsWithNewLevels, currentFactor)
+        missingLevelsInTrainingSet[[currentFactor]] <- factorLevelsInPredictionSet[missingLevelsIndex]
+      } else {
+        currentFactor <- model[["jaspVars"]][["decoded"]]$predictors[which(model[["jaspVars"]][["encoded"]]$predictors == currentFactor)]
+        factorsWithNewLevels <- c(factorsWithNewLevels, currentFactor)
+        missingLevelsInTrainingSet[[currentFactor]] <- factorLevelsInPredictionSet[missingLevelsIndex]
+      }
+    }
+  }
+  if (length(factorsWithNewLevels) > 0) {
+    setType <- switch(type, "test" = gettext("test set"), "validation" = gettext("validation set"), "prediction" = gettext("new dataset"))
+    additionalMessage <- switch(type,
+      "test" = gettext(" or use a different test set (e.g., automatically by setting a different seed or manually by specifying the test set indicator)"), 
+      "validation" = gettext(" or use a different validation set by setting a different seed"),
+      "prediction" = "")
+    factorMessage <- paste(sapply(factorsWithNewLevels, function(i) {
+        paste0("Factor: ", i, "; Levels: ", paste(missingLevelsInTrainingSet[[i]], collapse = ", "))
+      }), collapse = "\n")
+    jaspBase:::.quitAnalysis(gettextf("Some factors in the %1$s have levels that do not appear in the training set. Please remove the rows with the following levels from the dataset%2$s.\n\n%3$s", setType, additionalMessage, factorMessage))
+  }
+}
+
 .getCustomErrorChecksKnnBoosting <- function(dataset, options, type) {
   if (!type %in% c("knn", "boosting")) {
     return()

diff --git a/R/mlClassificationBoosting.R b/R/mlClassificationBoosting.R
@@ -98,6 +98,8 @@ mlClassificationBoosting <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     noOfFolds <- 0
     .mlBoostingCheckMinObsNode(options, trainingSet) # Check for min obs in nodes
     fit <- gbm::gbm(
@@ -113,6 +115,10 @@ mlClassificationBoosting <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     if (options[["modelValid"]] == "validationManual") {
       noOfFolds <- 0
     } else if (options[["modelValid"]] == "validationKFold") {

diff --git a/R/mlClassificationDecisionTree.R b/R/mlClassificationDecisionTree.R
@@ -89,6 +89,8 @@ mlClassificationDecisionTree <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     complexityPenalty <- options[["complexityParameter"]]
     trainingFit <- rpart::rpart(
       formula = formula, data = trainingSet, method = "class", x = TRUE, y = TRUE,
@@ -101,6 +103,10 @@ mlClassificationDecisionTree <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     cps <- seq(0, options[["maxComplexityParameter"]], by = 0.01)
     accuracyStore <- trainAccuracyStore <- numeric(length(cps))
     startProgressbar(length(cps))

diff --git a/R/mlClassificationKnn.R b/R/mlClassificationKnn.R
@@ -89,6 +89,8 @@ mlClassificationKnn <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     testFit <- kknn::kknn(
       formula = formula, train = trainingSet, test = testSet, k = options[["noOfNearestNeighbours"]],
       distance = distance, kernel = weights, scale = FALSE
@@ -100,6 +102,10 @@ mlClassificationKnn <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     if (options[["modelValid"]] == "validationManual") {
       nnRange <- 1:options[["maxNearestNeighbors"]]
       accuracyStore <- numeric(length(nnRange))

diff --git a/R/mlClassificationLda.R b/R/mlClassificationLda.R
@@ -112,6 +112,8 @@ mlClassificationLda <- function(jaspResults, dataset, options, ...) {
   }
   trainingSet <- dataset[trainingIndex, ]
   testSet <- dataset[-trainingIndex, ]
+  # Check for factor levels in the test set that are not in the training set
+  .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
   # Create the generated test set indicator
   testIndicatorColumn <- rep(1, nrow(dataset))
   testIndicatorColumn[trainingIndex] <- 0

diff --git a/R/mlClassificationLogisticMultinomial.R b/R/mlClassificationLogisticMultinomial.R
@@ -80,6 +80,8 @@ mlClassificationLogisticMultinomial <- function(jaspResults, dataset, options, .
   testIndicatorColumn[trainingIndex] <- 0
   # Just create a train and a test set (no optimization)
   testSet <- dataset[-trainingIndex, ]
+  # Check for factor levels in the test set that are not in the training set
+  .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
   # Create the formula
   if (options[["intercept"]]) {
     formula <- formula(paste(options[["target"]], "~ 1 + ", paste(options[["predictors"]], collapse = " + ")))

diff --git a/R/mlClassificationNaiveBayes.R b/R/mlClassificationNaiveBayes.R
@@ -81,6 +81,8 @@ mlClassificationNaiveBayes <- function(jaspResults, dataset, options, ...) {
   testIndicatorColumn[trainingIndex] <- 0
   # Just create a train and a test set (no optimization)
   testSet <- dataset[-trainingIndex, ]
+  # Check for factor levels in the test set that are not in the training set
+  .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
   trainingFit <- e1071::naiveBayes(formula, data = trainingSet, laplace = options[["smoothingParameter"]])
   # Use the specified model to make predictions for dataset
   testPredictions <- predict(trainingFit, newdata = testSet, type = "class")

diff --git a/R/mlClassificationNeuralNetwork.R b/R/mlClassificationNeuralNetwork.R
@@ -95,6 +95,8 @@ mlClassificationNeuralNetwork <- function(jaspResults, dataset, options, ...) {
   if (options[["modelOptimization"]] == "manual") {
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     structure <- .getNeuralNetworkStructure(options)
     p <- try({
       fit <- neuralnet::neuralnet(
@@ -120,6 +122,10 @@ mlClassificationNeuralNetwork <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     accuracyStore <- numeric(options[["maxGenerations"]])
     trainAccuracyStore <- numeric(options[["maxGenerations"]])
     # For plotting

diff --git a/R/mlClassificationRandomForest.R b/R/mlClassificationRandomForest.R
@@ -92,6 +92,8 @@ mlClassificationRandomForest <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     testFit <- randomForest::randomForest(
       x = trainingSet[, options[["predictors"]]], y = trainingSet[, options[["target"]]],
       xtest = testSet[, options[["predictors"]]], ytest = testSet[, options[["target"]]],
@@ -106,6 +108,10 @@ mlClassificationRandomForest <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     validationFit <- randomForest::randomForest(
       x = trainingSet[, options[["predictors"]]], y = trainingSet[, options[["target"]]],
       xtest = validationSet[, options[["predictors"]]], ytest = validationSet[, options[["target"]]],

diff --git a/R/mlClassificationSvm.R b/R/mlClassificationSvm.R
@@ -86,6 +86,8 @@ mlClassificationSvm <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     cost <- options[["cost"]]
     trainingFit <- e1071::svm(
       formula = formula, data = trainingSet, type = "C-classification", kernel = options[["weights"]], cost = cost, tolerance = options[["tolerance"]],
@@ -97,6 +99,10 @@ mlClassificationSvm <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     costs <- seq(0.01, options[["maxCost"]], 0.01)
     accuracyStore <- trainAccuracyStore <- numeric(length(costs))
     startProgressbar(length(costs))

diff --git a/R/mlPrediction.R b/R/mlPrediction.R
@@ -221,7 +221,7 @@ is.jaspMachineLearning <- function(x) {
 
 # also define methods for other objects
 .mlPredictionReady <- function(model, dataset, options) {
-  if (!is.null(model)) {
+  if (!is.null(model) && !is.null(dataset)) {
     modelVars <- model[["jaspVars"]][["encoded"]]$predictors
     presentVars <- colnames(dataset)
     ready <- all(modelVars %in% presentVars)
@@ -241,12 +241,25 @@ is.jaspMachineLearning <- function(x) {
 }
 
 .mlPredictionReadData <- function(dataset, options, model) {
-  dataset <- jaspBase::excludeNaListwise(dataset, options[["predictors"]])
-  if (options[["scaleVariables"]] && length(unlist(options[["predictors"]])) > 0) {
-    dataset <- .scaleNumericData(dataset)
+  if (length(options[["predictors"]]) == 0) {
+    dataset <- NULL
+  } else {
+    dataset <- jaspBase::excludeNaListwise(dataset, options[["predictors"]])
+    if (options[["scaleVariables"]] && length(unlist(options[["predictors"]])) > 0) {
+      dataset <- .scaleNumericData(dataset)
+    }
+    # Select only the predictors in the model to prevent accidental double column names
+    dataset <- dataset[, which(decodeColNames(colnames(dataset)) %in% model[["jaspVars"]][["decoded"]]$predictors)]
+    # Ensure the column names in the dataset match those in the training data
+    colnames(dataset) <- .matchDecodedNames(colnames(dataset), model)
+    # Retrieve the training set
+    trainingSet <- model[["explainer"]]$data
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, dataset, "prediction", model)
+    # Ensure factor variables in dataset have same levels as those in the training data
+    factorColumns <- colnames(dataset)[sapply(dataset, is.factor)]
+    dataset[factorColumns] <- lapply(factorColumns, function(i) factor(dataset[[i]], levels = levels(trainingSet[[i]])))
   }
-  dataset <- dataset[, which(decodeColNames(colnames(dataset)) %in% model[["jaspVars"]][["decoded"]]$predictors)] # Filter only predictors to prevent accidental double column names
-  colnames(dataset) <- .matchDecodedNames(colnames(dataset), model)
   return(dataset)
 }
 

diff --git a/R/mlRegressionBoosting.R b/R/mlRegressionBoosting.R
@@ -86,6 +86,8 @@ mlRegressionBoosting <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     noOfFolds <- 0
     .mlBoostingCheckMinObsNode(options, trainingSet) # Check for min obs in nodes
     trainingFit <- gbm::gbm(
@@ -101,6 +103,10 @@ mlRegressionBoosting <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     if (options[["modelValid"]] == "validationManual") {
       noOfFolds <- 0
     } else if (options[["modelValid"]] == "validationKFold") {

diff --git a/R/mlRegressionDecisionTree.R b/R/mlRegressionDecisionTree.R
@@ -77,6 +77,8 @@ mlRegressionDecisionTree <- function(jaspResults, dataset, options, state = NULL
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     complexityPenalty <- options[["complexityParameter"]]
     trainingFit <- rpart::rpart(
       formula = formula, data = trainingSet, method = "anova", x = TRUE, y = TRUE,
@@ -89,6 +91,10 @@ mlRegressionDecisionTree <- function(jaspResults, dataset, options, state = NULL
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     cps <- seq(0, options[["maxComplexityParameter"]], by = 0.01)
     errorStore <- trainErrorStore <- numeric(length(cps))
     startProgressbar(length(cps))

diff --git a/R/mlRegressionKnn.R b/R/mlRegressionKnn.R
@@ -77,6 +77,8 @@ mlRegressionKnn <- function(jaspResults, dataset, options, state = NULL) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     testFit <- kknn::kknn(
       formula = formula, train = trainingSet, test = testSet, k = options[["noOfNearestNeighbours"]],
       distance = distance, kernel = weights, scale = FALSE
@@ -88,6 +90,10 @@ mlRegressionKnn <- function(jaspResults, dataset, options, state = NULL) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     if (options[["modelValid"]] == "validationManual") {
       nnRange <- 1:options[["maxNearestNeighbors"]]
       errorStore <- numeric(length(nnRange))

diff --git a/R/mlRegressionLinear.R b/R/mlRegressionLinear.R
@@ -68,6 +68,8 @@ mlRegressionLinear <- function(jaspResults, dataset, options, ...) {
   # Just create a train and a test set (no optimization)
   trainingSet <- trainingAndValidationSet
   testSet <- dataset[-trainingIndex, ]
+  # Check for factor levels in the test set that are not in the training set
+  .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
   if (options[["intercept"]]) {
     formula <- formula(paste(options[["target"]], "~ 1 + ", paste(options[["predictors"]], collapse = " + ")))
   } else {

diff --git a/R/mlRegressionNeuralNetwork.R b/R/mlRegressionNeuralNetwork.R
@@ -140,6 +140,8 @@ mlRegressionNeuralNetwork <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     # Structure of neural network
     structure <- .getNeuralNetworkStructure(options)
     p <- try({
@@ -167,6 +169,10 @@ mlRegressionNeuralNetwork <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     valid <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, valid, "validation")
     errorStore <- numeric(options[["maxGenerations"]])
     trainErrorStore <- numeric(options[["maxGenerations"]])
     # For plotting