diff --git a/R/commonMachineLearningRegression.R b/R/commonMachineLearningRegression.R
index c3edee3a..2adc1282 100644
--- a/R/commonMachineLearningRegression.R
+++ b/R/commonMachineLearningRegression.R
@@ -72,10 +72,6 @@
   if (length(unlist(options[["predictors"]])) > 0 && options[["scaleVariables"]]) {
     dataset[, options[["predictors"]]] <- .scaleNumericData(dataset[, options[["predictors"]], drop = FALSE])
   }
-  # Make sure the test set indicator is numeric
-  if (options[["testSetIndicatorVariable"]] != "" && options[["holdoutData"]] == "testSetIndicator")
-    dataset[[options[["testSetIndicatorVariable"]]]] <- as.numeric(dataset[[options[["testSetIndicatorVariable"]]]])
-
   return(dataset)
 }
@@ -145,6 +141,40 @@
   }
 }
+
+.checkForNewFactorLevelsInPredictionSet <- function(trainingSet, predictionSet, type, model = NULL) {
+  factorNames <- colnames(predictionSet)[sapply(predictionSet, is.factor)]
+  factorNames <- factorNames[which(factorNames %in% colnames(trainingSet))]
+  factorsWithNewLevels <- character()
+  missingLevelsInTrainingSet <- list()
+  for (i in seq_along(factorNames)) {
+    currentFactor <- factorNames[i]
+    factorLevelsInTrainingSet <- unique(trainingSet[[currentFactor]])
+    factorLevelsInPredictionSet <- unique(predictionSet[[currentFactor]])
+    missingLevelsIndex <- which(!(factorLevelsInPredictionSet %in% factorLevelsInTrainingSet))
+    if (length(missingLevelsIndex) > 0) {
+      if (type != "prediction") {
+        factorsWithNewLevels <- c(factorsWithNewLevels, currentFactor)
+        missingLevelsInTrainingSet[[currentFactor]] <- factorLevelsInPredictionSet[missingLevelsIndex]
+      } else {
+        currentFactor <- model[["jaspVars"]][["decoded"]]$predictors[which(model[["jaspVars"]][["encoded"]]$predictors == currentFactor)]
+        factorsWithNewLevels <- c(factorsWithNewLevels, currentFactor)
+        missingLevelsInTrainingSet[[currentFactor]] <- factorLevelsInPredictionSet[missingLevelsIndex]
+      }
+    }
+  }
+  if (length(factorsWithNewLevels) > 0) {
+    setType <- switch(type, "test" = gettext("test set"), "validation" = gettext("validation set"), "prediction" = gettext("new dataset"))
+    additionalMessage <- switch(type,
+      "test" = gettext(" or use a different test set (e.g., automatically by setting a different seed or manually by specifying the test set indicator)"),
+      "validation" = gettext(" or use a different validation set by setting a different seed"),
+      "prediction" = "")
+    factorMessage <- paste(sapply(factorsWithNewLevels, function(i) {
+      paste0("Factor: ", i, "; Levels: ", paste(missingLevelsInTrainingSet[[i]], collapse = ", "))
+    }), collapse = "\n")
+    jaspBase:::.quitAnalysis(gettextf("Some factors in the %1$s have levels that do not appear in the training set. Please remove the rows with the following levels from the dataset%2$s.\n\n%3$s", setType, additionalMessage, factorMessage))
+  }
+}
+
 .getCustomErrorChecksKnnBoosting <- function(dataset, options, type) {
   if (!type %in% c("knn", "boosting")) {
     return()
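For illustration, a minimal standalone sketch (hypothetical data, not part of the patch) of the situation the new helper aborts on: a factor level that occurs in a hold-out set but not in the training set. Inside JASP, .checkForNewFactorLevelsInPredictionSet() would stop the analysis with the message constructed above.

  # Hypothetical data: level "c" appears only in the test set, so a model fitted
  # on trainingSet has no encoding for that row; the helper stops the analysis.
  trainingSet <- data.frame(x = factor(c("a", "b", "a")), y = c(1, 2, 3))
  testSet <- data.frame(x = factor(c("a", "c")), y = c(4, 5))
  setdiff(unique(as.character(testSet$x)), unique(as.character(trainingSet$x))) # "c"
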
diff --git a/R/mlClassificationBoosting.R b/R/mlClassificationBoosting.R
index 1c37f90a..8edcfa69 100644
--- a/R/mlClassificationBoosting.R
+++ b/R/mlClassificationBoosting.R
@@ -98,6 +98,8 @@ mlClassificationBoosting <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     noOfFolds <- 0
     .mlBoostingCheckMinObsNode(options, trainingSet) # Check for min obs in nodes
     fit <- gbm::gbm(
@@ -113,6 +115,10 @@ mlClassificationBoosting <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     if (options[["modelValid"]] == "validationManual") {
       noOfFolds <- 0
     } else if (options[["modelValid"]] == "validationKFold") {
diff --git a/R/mlClassificationDecisionTree.R b/R/mlClassificationDecisionTree.R
index 3c3f31b9..cef12cf9 100644
--- a/R/mlClassificationDecisionTree.R
+++ b/R/mlClassificationDecisionTree.R
@@ -89,6 +89,8 @@ mlClassificationDecisionTree <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     complexityPenalty <- options[["complexityParameter"]]
     trainingFit <- rpart::rpart(
       formula = formula, data = trainingSet, method = "class", x = TRUE, y = TRUE,
@@ -101,6 +103,10 @@ mlClassificationDecisionTree <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     cps <- seq(0, options[["maxComplexityParameter"]], by = 0.01)
     accuracyStore <- trainAccuracyStore <- numeric(length(cps))
     startProgressbar(length(cps))
diff --git a/R/mlClassificationKnn.R b/R/mlClassificationKnn.R
index 8d6e0268..b3b4a3d4 100644
--- a/R/mlClassificationKnn.R
+++ b/R/mlClassificationKnn.R
@@ -89,6 +89,8 @@ mlClassificationKnn <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     testFit <- kknn::kknn(
       formula = formula, train = trainingSet, test = testSet, k = options[["noOfNearestNeighbours"]],
       distance = distance, kernel = weights, scale = FALSE
@@ -100,6 +102,10 @@ mlClassificationKnn <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     if (options[["modelValid"]] == "validationManual") {
       nnRange <- 1:options[["maxNearestNeighbors"]]
       accuracyStore <- numeric(length(nnRange))
diff --git a/R/mlClassificationLda.R b/R/mlClassificationLda.R
index 7a718e2a..5d1f27ca 100644
--- a/R/mlClassificationLda.R
+++ b/R/mlClassificationLda.R
@@ -112,6 +112,8 @@ mlClassificationLda <- function(jaspResults, dataset, options, ...) {
   }
   trainingSet <- dataset[trainingIndex, ]
   testSet <- dataset[-trainingIndex, ]
+  # Check for factor levels in the test set that are not in the training set
+  .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
   # Create the generated test set indicator
   testIndicatorColumn <- rep(1, nrow(dataset))
   testIndicatorColumn[trainingIndex] <- 0
diff --git a/R/mlClassificationLogisticMultinomial.R b/R/mlClassificationLogisticMultinomial.R
index 465e7ed5..71f5af13 100644
--- a/R/mlClassificationLogisticMultinomial.R
+++ b/R/mlClassificationLogisticMultinomial.R
@@ -80,6 +80,8 @@ mlClassificationLogisticMultinomial <- function(jaspResults, dataset, options, ...) {
   testIndicatorColumn[trainingIndex] <- 0
   # Just create a train and a test set (no optimization)
   testSet <- dataset[-trainingIndex, ]
+  # Check for factor levels in the test set that are not in the training set
+  .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
   # Create the formula
   if (options[["intercept"]]) {
     formula <- formula(paste(options[["target"]], "~ 1 + ", paste(options[["predictors"]], collapse = " + ")))
diff --git a/R/mlClassificationNaiveBayes.R b/R/mlClassificationNaiveBayes.R
index 0ae613be..d2f7a9e0 100644
--- a/R/mlClassificationNaiveBayes.R
+++ b/R/mlClassificationNaiveBayes.R
@@ -81,6 +81,8 @@ mlClassificationNaiveBayes <- function(jaspResults, dataset, options, ...) {
   testIndicatorColumn[trainingIndex] <- 0
   # Just create a train and a test set (no optimization)
   testSet <- dataset[-trainingIndex, ]
+  # Check for factor levels in the test set that are not in the training set
+  .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
   trainingFit <- e1071::naiveBayes(formula, data = trainingSet, laplace = options[["smoothingParameter"]])
   # Use the specified model to make predictions for dataset
   testPredictions <- predict(trainingFit, newdata = testSet, type = "class")
diff --git a/R/mlClassificationNeuralNetwork.R b/R/mlClassificationNeuralNetwork.R
index 053e9765..e228b857 100644
--- a/R/mlClassificationNeuralNetwork.R
+++ b/R/mlClassificationNeuralNetwork.R
@@ -95,6 +95,8 @@ mlClassificationNeuralNetwork <- function(jaspResults, dataset, options, ...) {
   if (options[["modelOptimization"]] == "manual") {
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     structure <- .getNeuralNetworkStructure(options)
     p <- try({
       fit <- neuralnet::neuralnet(
@@ -120,6 +122,10 @@ mlClassificationNeuralNetwork <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     accuracyStore <- numeric(options[["maxGenerations"]])
     trainAccuracyStore <- numeric(options[["maxGenerations"]])
     # For plotting
diff --git a/R/mlClassificationRandomForest.R b/R/mlClassificationRandomForest.R
index 81d67800..a9f6848c 100644
--- a/R/mlClassificationRandomForest.R
+++ b/R/mlClassificationRandomForest.R
@@ -92,6 +92,8 @@ mlClassificationRandomForest <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     testFit <- randomForest::randomForest(
       x = trainingSet[, options[["predictors"]]], y = trainingSet[, options[["target"]]],
       xtest = testSet[, options[["predictors"]]], ytest = testSet[, options[["target"]]],
@@ -106,6 +108,10 @@ mlClassificationRandomForest <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     validationFit <- randomForest::randomForest(
       x = trainingSet[, options[["predictors"]]], y = trainingSet[, options[["target"]]],
       xtest = validationSet[, options[["predictors"]]], ytest = validationSet[, options[["target"]]],
diff --git a/R/mlClassificationSvm.R b/R/mlClassificationSvm.R
index dae6a85d..e576d85a 100644
--- a/R/mlClassificationSvm.R
+++ b/R/mlClassificationSvm.R
@@ -86,6 +86,8 @@ mlClassificationSvm <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     cost <- options[["cost"]]
     trainingFit <- e1071::svm(
       formula = formula, data = trainingSet, type = "C-classification", kernel = options[["weights"]], cost = cost, tolerance = options[["tolerance"]],
@@ -97,6 +99,10 @@ mlClassificationSvm <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     costs <- seq(0.01, options[["maxCost"]], 0.01)
     accuracyStore <- trainAccuracyStore <- numeric(length(costs))
     startProgressbar(length(costs))
diff --git a/R/mlPrediction.R b/R/mlPrediction.R
index 6ca7d1a5..dfae59ac 100644
--- a/R/mlPrediction.R
+++ b/R/mlPrediction.R
@@ -221,7 +221,7 @@ is.jaspMachineLearning <- function(x) {

 # also define methods for other objects
 .mlPredictionReady <- function(model, dataset, options) {
-  if (!is.null(model)) {
+  if (!is.null(model) && !is.null(dataset)) {
     modelVars <- model[["jaspVars"]][["encoded"]]$predictors
     presentVars <- colnames(dataset)
     ready <- all(modelVars %in% presentVars)
@@ -241,12 +241,25 @@ is.jaspMachineLearning <- function(x) {
 }

 .mlPredictionReadData <- function(dataset, options, model) {
-  dataset <- jaspBase::excludeNaListwise(dataset, options[["predictors"]])
-  if (options[["scaleVariables"]] && length(unlist(options[["predictors"]])) > 0) {
-    dataset <- .scaleNumericData(dataset)
+  if (length(options[["predictors"]]) == 0) {
+    dataset <- NULL
+  } else {
+    dataset <- jaspBase::excludeNaListwise(dataset, options[["predictors"]])
+    if (options[["scaleVariables"]] && length(unlist(options[["predictors"]])) > 0) {
+      dataset <- .scaleNumericData(dataset)
+    }
+    # Select only the predictors in the model to prevent accidental double column names
+    dataset <- dataset[, which(decodeColNames(colnames(dataset)) %in% model[["jaspVars"]][["decoded"]]$predictors)]
+    # Ensure the column names in the dataset match those in the training data
+    colnames(dataset) <- .matchDecodedNames(colnames(dataset), model)
+    # Retrieve the training set
+    trainingSet <- model[["explainer"]]$data
+    # Check for factor levels in the new dataset that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, dataset, "prediction", model)
+    # Ensure factor variables in the dataset have the same levels as those in the training data
+    factorColumns <- colnames(dataset)[sapply(dataset, is.factor)]
+    dataset[factorColumns] <- lapply(factorColumns, function(i) factor(dataset[[i]], levels = levels(trainingSet[[i]])))
   }
-  dataset <- dataset[, which(decodeColNames(colnames(dataset)) %in% model[["jaspVars"]][["decoded"]]$predictors)] # Filter only predictors to prevent accidental double column names
-  colnames(dataset) <- .matchDecodedNames(colnames(dataset), model)
   return(dataset)
 }
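The re-leveling step at the end of .mlPredictionReadData matters because treatment coding is defined by a factor's own levels, so the same value can be encoded differently when the prediction set carries fewer levels than the training set. A minimal sketch (hypothetical data, not part of the patch):

  # With only levels "b" and "c" present, "b" becomes the baseline and the dummy
  # columns no longer line up with the training design matrix.
  trainLevels <- c("a", "b", "c")
  newData <- data.frame(x = factor(c("b", "c")))       # levels "b", "c" only
  model.matrix(~ x, newData)                           # columns: (Intercept), xc
  newData$x <- factor(newData$x, levels = trainLevels) # re-level as in the patch
  model.matrix(~ x, newData)                           # columns: (Intercept), xb, xc
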
diff --git a/R/mlRegressionBoosting.R b/R/mlRegressionBoosting.R
index 23c845c5..38e3b459 100644
--- a/R/mlRegressionBoosting.R
+++ b/R/mlRegressionBoosting.R
@@ -86,6 +86,8 @@ mlRegressionBoosting <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     noOfFolds <- 0
     .mlBoostingCheckMinObsNode(options, trainingSet) # Check for min obs in nodes
     trainingFit <- gbm::gbm(
@@ -101,6 +103,10 @@ mlRegressionBoosting <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     if (options[["modelValid"]] == "validationManual") {
       noOfFolds <- 0
     } else if (options[["modelValid"]] == "validationKFold") {
diff --git a/R/mlRegressionDecisionTree.R b/R/mlRegressionDecisionTree.R
index 4ba00e9f..31d10b45 100644
--- a/R/mlRegressionDecisionTree.R
+++ b/R/mlRegressionDecisionTree.R
@@ -77,6 +77,8 @@ mlRegressionDecisionTree <- function(jaspResults, dataset, options, state = NULL) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     complexityPenalty <- options[["complexityParameter"]]
     trainingFit <- rpart::rpart(
       formula = formula, data = trainingSet, method = "anova", x = TRUE, y = TRUE,
@@ -89,6 +91,10 @@ mlRegressionDecisionTree <- function(jaspResults, dataset, options, state = NULL) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     cps <- seq(0, options[["maxComplexityParameter"]], by = 0.01)
     errorStore <- trainErrorStore <- numeric(length(cps))
     startProgressbar(length(cps))
diff --git a/R/mlRegressionKnn.R b/R/mlRegressionKnn.R
index 5312e14b..32828523 100644
--- a/R/mlRegressionKnn.R
+++ b/R/mlRegressionKnn.R
@@ -77,6 +77,8 @@ mlRegressionKnn <- function(jaspResults, dataset, options, state = NULL) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     testFit <- kknn::kknn(
       formula = formula, train = trainingSet, test = testSet, k = options[["noOfNearestNeighbours"]],
       distance = distance, kernel = weights, scale = FALSE
@@ -88,6 +90,10 @@ mlRegressionKnn <- function(jaspResults, dataset, options, state = NULL) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     if (options[["modelValid"]] == "validationManual") {
       nnRange <- 1:options[["maxNearestNeighbors"]]
       errorStore <- numeric(length(nnRange))
diff --git a/R/mlRegressionLinear.R b/R/mlRegressionLinear.R
index 71b844c0..49f40591 100644
--- a/R/mlRegressionLinear.R
+++ b/R/mlRegressionLinear.R
@@ -68,6 +68,8 @@ mlRegressionLinear <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     if (options[["intercept"]]) {
       formula <- formula(paste(options[["target"]], "~ 1 + ", paste(options[["predictors"]], collapse = " + ")))
     } else {
diff --git a/R/mlRegressionNeuralNetwork.R b/R/mlRegressionNeuralNetwork.R
index b2d3c0d6..906d9ddd 100644
--- a/R/mlRegressionNeuralNetwork.R
+++ b/R/mlRegressionNeuralNetwork.R
@@ -140,6 +140,8 @@ mlRegressionNeuralNetwork <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     # Structure of neural network
     structure <- .getNeuralNetworkStructure(options)
     p <- try({
@@ -167,6 +169,10 @@ mlRegressionNeuralNetwork <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     valid <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, valid, "validation")
     errorStore <- numeric(options[["maxGenerations"]])
     trainErrorStore <- numeric(options[["maxGenerations"]])
     # For plotting
diff --git a/R/mlRegressionRandomForest.R b/R/mlRegressionRandomForest.R
index 7f02eaba..f599455d 100644
--- a/R/mlRegressionRandomForest.R
+++ b/R/mlRegressionRandomForest.R
@@ -80,6 +80,8 @@ mlRegressionRandomForest <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     testFit <- randomForest::randomForest(
       x = trainingSet[, options[["predictors"]]], y = trainingSet[, options[["target"]]],
       xtest = testSet[, options[["predictors"]]], ytest = testSet[, options[["target"]]],
@@ -94,6 +96,10 @@ mlRegressionRandomForest <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     validationFit <- randomForest::randomForest(
       x = trainingSet[, options[["predictors"]]], y = trainingSet[, options[["target"]]],
       xtest = validationSet[, options[["predictors"]]], ytest = validationSet[, options[["target"]]],
diff --git a/R/mlRegressionRegularized.R b/R/mlRegressionRegularized.R
index b02575ec..2c9f0b61 100644
--- a/R/mlRegressionRegularized.R
+++ b/R/mlRegressionRegularized.R
@@ -87,6 +87,8 @@ mlRegressionRegularized <- function(jaspResults, dataset, options, ...) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     trainingFit <- glmnet::cv.glmnet(
       x = as.matrix(trainingSet[, options[["predictors"]]]), y = trainingSet[, options[["target"]]],
       nfolds = 10, type.measure = "deviance",
@@ -106,6 +108,10 @@ mlRegressionRegularized <- function(jaspResults, dataset, options, ...) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     trainingWeights <- weights[trainingIndex]
     trainingFit <- glmnet::cv.glmnet(
       x = as.matrix(trainingSet[, options[["predictors"]]]), y = trainingSet[, options[["target"]]],
diff --git a/R/mlRegressionSvm.R b/R/mlRegressionSvm.R
index 19e5480a..7a634524 100644
--- a/R/mlRegressionSvm.R
+++ b/R/mlRegressionSvm.R
@@ -74,6 +74,8 @@ mlRegressionSvm <- function(jaspResults, dataset, options, state = NULL) {
     # Just create a train and a test set (no optimization)
     trainingSet <- trainingAndValidationSet
     testSet <- dataset[-trainingIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
     cost <- options[["cost"]]
     trainingFit <- e1071::svm(
       formula = formula, data = trainingSet, type = "eps-regression", kernel = options[["weights"]], cost = cost, tolerance = options[["tolerance"]],
@@ -85,6 +87,10 @@ mlRegressionSvm <- function(jaspResults, dataset, options, state = NULL) {
     testSet <- dataset[-trainingIndex, ]
     validationSet <- trainingAndValidationSet[validationIndex, ]
     trainingSet <- trainingAndValidationSet[-validationIndex, ]
+    # Check for factor levels in the test set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
+    # Check for factor levels in the validation set that are not in the training set
+    .checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
     costs <- seq(0.01, options[["maxCost"]], 0.01)
     errorStore <- trainErrorStore <- numeric(length(costs))
     startProgressbar(length(costs))
diff --git a/inst/qml/common/ui/DataSplit.qml b/inst/qml/common/ui/DataSplit.qml
index 80a7f9a8..4f65ddcf 100644
--- a/inst/qml/common/ui/DataSplit.qml
+++ b/inst/qml/common/ui/DataSplit.qml
@@ -84,7 +84,7 @@ Section
 			name:				"testSetIndicator"
 			label:				qsTr("Test set indicator")
 			childrenOnSameRow:	true
-			info:				qsTr("Use an indicator variable to select data for the test set. This indicator should be a column in your data that consists of only 0 (excluded from the test set) and 1 (included in the test set). The data will then be split into a training (and validation if requested) set (0), and a test set (1) according to your indicator.")
+			info:				qsTr("Use an indicator variable to select data for the test set. This indicator should be a column of type scale in your data that consists of only 0 (excluded from the test set) and 1 (included in the test set). The data will then be split into a training (and validation if requested) set (0), and a test set (1) according to your indicator.")

 			DropDown
 			{
@@ -94,6 +94,7 @@ Section
 				addEmptyValue:		true
 				placeholderText:	qsTr("None")
 				info:				qsTr("The variable in the data set that is used as the test set indicator.")
+				allowedColumns:		"scale"
 			}
 		}
 	}
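For context, a minimal sketch (hypothetical data and column name, not part of the patch) of the split that the scale-type 0/1 indicator described in the info text produces:

  # Rows with indicator 1 go to the test set; rows with 0 go to the training
  # (and, if requested, validation) set.
  dataset <- data.frame(y = rnorm(6), x = rnorm(6), testSetIndicator = c(0, 0, 1, 0, 1, 0))
  trainingSet <- dataset[dataset$testSetIndicator == 0, ]
  testSet <- dataset[dataset$testSetIndicator == 1, ]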