Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions R/commonMachineLearningRegression.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,6 @@
if (length(unlist(options[["predictors"]])) > 0 && options[["scaleVariables"]]) {
dataset[, options[["predictors"]]] <- .scaleNumericData(dataset[, options[["predictors"]], drop = FALSE])
}
# Make sure the test set indicator is numeric
if (options[["testSetIndicatorVariable"]] != "" && options[["holdoutData"]] == "testSetIndicator")
dataset[[options[["testSetIndicatorVariable"]]]] <- as.numeric(dataset[[options[["testSetIndicatorVariable"]]]])

return(dataset)
}

Expand Down Expand Up @@ -145,6 +141,40 @@
}
}

.checkForNewFactorLevelsInPredictionSet <- function(trainingSet, predictionSet, type, model = NULL) {
factorNames <- colnames(predictionSet)[sapply(predictionSet, is.factor)]
factorNames <- factorNames[which(factorNames %in% colnames(trainingSet))]
factorsWithNewLevels <- character()
missingLevelsInTrainingSet <- list()
for (i in seq_along(factorNames)) {
currentFactor <- factorNames[i]
factorLevelsInTrainingSet <- unique(trainingSet[[currentFactor]])
factorLevelsInPredictionSet <- unique(predictionSet[[currentFactor]])
missingLevelsIndex <- which(!(factorLevelsInPredictionSet %in% factorLevelsInTrainingSet))
if (length(missingLevelsIndex) > 0) {
if (type != "prediction") {
factorsWithNewLevels <- c(factorsWithNewLevels, currentFactor)
missingLevelsInTrainingSet[[currentFactor]] <- factorLevelsInPredictionSet[missingLevelsIndex]
} else {
currentFactor <- model[["jaspVars"]][["decoded"]]$predictors[which(model[["jaspVars"]][["encoded"]]$predictors == currentFactor)]
factorsWithNewLevels <- c(factorsWithNewLevels, currentFactor)
missingLevelsInTrainingSet[[currentFactor]] <- factorLevelsInPredictionSet[missingLevelsIndex]
}
}
}
if (length(factorsWithNewLevels) > 0) {
setType <- switch(type, "test" = gettext("test set"), "validation" = gettext("validation set"), "prediction" = gettext("new dataset"))
additionalMessage <- switch(type,
"test" = gettext(" or use a different test set (e.g., automatically by setting a different seed or manually by specifying the test set indicator)"),
"validation" = gettext(" or use a different validation set by setting a different seed"),
"prediction" = "")
factorMessage <- paste(sapply(factorsWithNewLevels, function(i) {
paste0("Factor: ", i, "; Levels: ", paste(missingLevelsInTrainingSet[[i]], collapse = ", "))
}), collapse = "\n")
jaspBase:::.quitAnalysis(gettextf("Some factors in the %1$s have levels that do not appear in the training set. Please remove the rows with the following levels from the dataset%2$s.\n\n%3$s", setType, additionalMessage, factorMessage))
}
}

.getCustomErrorChecksKnnBoosting <- function(dataset, options, type) {
if (!type %in% c("knn", "boosting")) {
return()
Expand Down
6 changes: 6 additions & 0 deletions R/mlClassificationBoosting.R
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ mlClassificationBoosting <- function(jaspResults, dataset, options, ...) {
# Just create a train and a test set (no optimization)
trainingSet <- trainingAndValidationSet
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
noOfFolds <- 0
.mlBoostingCheckMinObsNode(options, trainingSet) # Check for min obs in nodes
fit <- gbm::gbm(
Expand All @@ -113,6 +115,10 @@ mlClassificationBoosting <- function(jaspResults, dataset, options, ...) {
testSet <- dataset[-trainingIndex, ]
validationSet <- trainingAndValidationSet[validationIndex, ]
trainingSet <- trainingAndValidationSet[-validationIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Check for factor levels in the validation set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
if (options[["modelValid"]] == "validationManual") {
noOfFolds <- 0
} else if (options[["modelValid"]] == "validationKFold") {
Expand Down
6 changes: 6 additions & 0 deletions R/mlClassificationDecisionTree.R
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ mlClassificationDecisionTree <- function(jaspResults, dataset, options, ...) {
# Just create a train and a test set (no optimization)
trainingSet <- trainingAndValidationSet
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
complexityPenalty <- options[["complexityParameter"]]
trainingFit <- rpart::rpart(
formula = formula, data = trainingSet, method = "class", x = TRUE, y = TRUE,
Expand All @@ -101,6 +103,10 @@ mlClassificationDecisionTree <- function(jaspResults, dataset, options, ...) {
testSet <- dataset[-trainingIndex, ]
validationSet <- trainingAndValidationSet[validationIndex, ]
trainingSet <- trainingAndValidationSet[-validationIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Check for factor levels in the validation set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
cps <- seq(0, options[["maxComplexityParameter"]], by = 0.01)
accuracyStore <- trainAccuracyStore <- numeric(length(cps))
startProgressbar(length(cps))
Expand Down
6 changes: 6 additions & 0 deletions R/mlClassificationKnn.R
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ mlClassificationKnn <- function(jaspResults, dataset, options, ...) {
# Just create a train and a test set (no optimization)
trainingSet <- trainingAndValidationSet
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
testFit <- kknn::kknn(
formula = formula, train = trainingSet, test = testSet, k = options[["noOfNearestNeighbours"]],
distance = distance, kernel = weights, scale = FALSE
Expand All @@ -100,6 +102,10 @@ mlClassificationKnn <- function(jaspResults, dataset, options, ...) {
testSet <- dataset[-trainingIndex, ]
validationSet <- trainingAndValidationSet[validationIndex, ]
trainingSet <- trainingAndValidationSet[-validationIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Check for factor levels in the validation set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
if (options[["modelValid"]] == "validationManual") {
nnRange <- 1:options[["maxNearestNeighbors"]]
accuracyStore <- numeric(length(nnRange))
Expand Down
2 changes: 2 additions & 0 deletions R/mlClassificationLda.R
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ mlClassificationLda <- function(jaspResults, dataset, options, ...) {
}
trainingSet <- dataset[trainingIndex, ]
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Create the generated test set indicator
testIndicatorColumn <- rep(1, nrow(dataset))
testIndicatorColumn[trainingIndex] <- 0
Expand Down
2 changes: 2 additions & 0 deletions R/mlClassificationLogisticMultinomial.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ mlClassificationLogisticMultinomial <- function(jaspResults, dataset, options, .
testIndicatorColumn[trainingIndex] <- 0
# Just create a train and a test set (no optimization)
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Create the formula
if (options[["intercept"]]) {
formula <- formula(paste(options[["target"]], "~ 1 + ", paste(options[["predictors"]], collapse = " + ")))
Expand Down
2 changes: 2 additions & 0 deletions R/mlClassificationNaiveBayes.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ mlClassificationNaiveBayes <- function(jaspResults, dataset, options, ...) {
testIndicatorColumn[trainingIndex] <- 0
# Just create a train and a test set (no optimization)
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
trainingFit <- e1071::naiveBayes(formula, data = trainingSet, laplace = options[["smoothingParameter"]])
# Use the specified model to make predictions for dataset
testPredictions <- predict(trainingFit, newdata = testSet, type = "class")
Expand Down
6 changes: 6 additions & 0 deletions R/mlClassificationNeuralNetwork.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ mlClassificationNeuralNetwork <- function(jaspResults, dataset, options, ...) {
if (options[["modelOptimization"]] == "manual") {
trainingSet <- trainingAndValidationSet
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
structure <- .getNeuralNetworkStructure(options)
p <- try({
fit <- neuralnet::neuralnet(
Expand All @@ -120,6 +122,10 @@ mlClassificationNeuralNetwork <- function(jaspResults, dataset, options, ...) {
testSet <- dataset[-trainingIndex, ]
validationSet <- trainingAndValidationSet[validationIndex, ]
trainingSet <- trainingAndValidationSet[-validationIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Check for factor levels in the validation set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
accuracyStore <- numeric(options[["maxGenerations"]])
trainAccuracyStore <- numeric(options[["maxGenerations"]])
# For plotting
Expand Down
6 changes: 6 additions & 0 deletions R/mlClassificationRandomForest.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ mlClassificationRandomForest <- function(jaspResults, dataset, options, ...) {
# Just create a train and a test set (no optimization)
trainingSet <- trainingAndValidationSet
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
testFit <- randomForest::randomForest(
x = trainingSet[, options[["predictors"]]], y = trainingSet[, options[["target"]]],
xtest = testSet[, options[["predictors"]]], ytest = testSet[, options[["target"]]],
Expand All @@ -106,6 +108,10 @@ mlClassificationRandomForest <- function(jaspResults, dataset, options, ...) {
testSet <- dataset[-trainingIndex, ]
validationSet <- trainingAndValidationSet[validationIndex, ]
trainingSet <- trainingAndValidationSet[-validationIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Check for factor levels in the validation set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
validationFit <- randomForest::randomForest(
x = trainingSet[, options[["predictors"]]], y = trainingSet[, options[["target"]]],
xtest = validationSet[, options[["predictors"]]], ytest = validationSet[, options[["target"]]],
Expand Down
6 changes: 6 additions & 0 deletions R/mlClassificationSvm.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ mlClassificationSvm <- function(jaspResults, dataset, options, ...) {
# Just create a train and a test set (no optimization)
trainingSet <- trainingAndValidationSet
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
cost <- options[["cost"]]
trainingFit <- e1071::svm(
formula = formula, data = trainingSet, type = "C-classification", kernel = options[["weights"]], cost = cost, tolerance = options[["tolerance"]],
Expand All @@ -97,6 +99,10 @@ mlClassificationSvm <- function(jaspResults, dataset, options, ...) {
testSet <- dataset[-trainingIndex, ]
validationSet <- trainingAndValidationSet[validationIndex, ]
trainingSet <- trainingAndValidationSet[-validationIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Check for factor levels in the validation set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
costs <- seq(0.01, options[["maxCost"]], 0.01)
accuracyStore <- trainAccuracyStore <- numeric(length(costs))
startProgressbar(length(costs))
Expand Down
25 changes: 19 additions & 6 deletions R/mlPrediction.R
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ is.jaspMachineLearning <- function(x) {

# also define methods for other objects
.mlPredictionReady <- function(model, dataset, options) {
if (!is.null(model)) {
if (!is.null(model) && !is.null(dataset)) {
modelVars <- model[["jaspVars"]][["encoded"]]$predictors
presentVars <- colnames(dataset)
ready <- all(modelVars %in% presentVars)
Expand All @@ -241,12 +241,25 @@ is.jaspMachineLearning <- function(x) {
}

.mlPredictionReadData <- function(dataset, options, model) {
dataset <- jaspBase::excludeNaListwise(dataset, options[["predictors"]])
if (options[["scaleVariables"]] && length(unlist(options[["predictors"]])) > 0) {
dataset <- .scaleNumericData(dataset)
if (length(options[["predictors"]]) == 0) {
dataset <- NULL
} else {
dataset <- jaspBase::excludeNaListwise(dataset, options[["predictors"]])
if (options[["scaleVariables"]] && length(unlist(options[["predictors"]])) > 0) {
dataset <- .scaleNumericData(dataset)
}
# Select only the predictors in the model to prevent accidental double column names
dataset <- dataset[, which(decodeColNames(colnames(dataset)) %in% model[["jaspVars"]][["decoded"]]$predictors)]
# Ensure the column names in the dataset match those in the training data
colnames(dataset) <- .matchDecodedNames(colnames(dataset), model)
# Retrieve the training set
trainingSet <- model[["explainer"]]$data
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, dataset, "prediction", model)
# Ensure factor variables in dataset have same levels as those in the training data
factorColumns <- colnames(dataset)[sapply(dataset, is.factor)]
dataset[factorColumns] <- lapply(factorColumns, function(i) factor(dataset[[i]], levels = levels(trainingSet[[i]])))
}
dataset <- dataset[, which(decodeColNames(colnames(dataset)) %in% model[["jaspVars"]][["decoded"]]$predictors)] # Filter only predictors to prevent accidental double column names
colnames(dataset) <- .matchDecodedNames(colnames(dataset), model)
return(dataset)
}

Expand Down
6 changes: 6 additions & 0 deletions R/mlRegressionBoosting.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ mlRegressionBoosting <- function(jaspResults, dataset, options, ...) {
# Just create a train and a test set (no optimization)
trainingSet <- trainingAndValidationSet
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
noOfFolds <- 0
.mlBoostingCheckMinObsNode(options, trainingSet) # Check for min obs in nodes
trainingFit <- gbm::gbm(
Expand All @@ -101,6 +103,10 @@ mlRegressionBoosting <- function(jaspResults, dataset, options, ...) {
testSet <- dataset[-trainingIndex, ]
validationSet <- trainingAndValidationSet[validationIndex, ]
trainingSet <- trainingAndValidationSet[-validationIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Check for factor levels in the validation set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
if (options[["modelValid"]] == "validationManual") {
noOfFolds <- 0
} else if (options[["modelValid"]] == "validationKFold") {
Expand Down
6 changes: 6 additions & 0 deletions R/mlRegressionDecisionTree.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ mlRegressionDecisionTree <- function(jaspResults, dataset, options, state = NULL
# Just create a train and a test set (no optimization)
trainingSet <- trainingAndValidationSet
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
complexityPenalty <- options[["complexityParameter"]]
trainingFit <- rpart::rpart(
formula = formula, data = trainingSet, method = "anova", x = TRUE, y = TRUE,
Expand All @@ -89,6 +91,10 @@ mlRegressionDecisionTree <- function(jaspResults, dataset, options, state = NULL
testSet <- dataset[-trainingIndex, ]
validationSet <- trainingAndValidationSet[validationIndex, ]
trainingSet <- trainingAndValidationSet[-validationIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Check for factor levels in the validation set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
cps <- seq(0, options[["maxComplexityParameter"]], by = 0.01)
errorStore <- trainErrorStore <- numeric(length(cps))
startProgressbar(length(cps))
Expand Down
6 changes: 6 additions & 0 deletions R/mlRegressionKnn.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ mlRegressionKnn <- function(jaspResults, dataset, options, state = NULL) {
# Just create a train and a test set (no optimization)
trainingSet <- trainingAndValidationSet
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
testFit <- kknn::kknn(
formula = formula, train = trainingSet, test = testSet, k = options[["noOfNearestNeighbours"]],
distance = distance, kernel = weights, scale = FALSE
Expand All @@ -88,6 +90,10 @@ mlRegressionKnn <- function(jaspResults, dataset, options, state = NULL) {
testSet <- dataset[-trainingIndex, ]
validationSet <- trainingAndValidationSet[validationIndex, ]
trainingSet <- trainingAndValidationSet[-validationIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Check for factor levels in the validation set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, validationSet, "validation")
if (options[["modelValid"]] == "validationManual") {
nnRange <- 1:options[["maxNearestNeighbors"]]
errorStore <- numeric(length(nnRange))
Expand Down
2 changes: 2 additions & 0 deletions R/mlRegressionLinear.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ mlRegressionLinear <- function(jaspResults, dataset, options, ...) {
# Just create a train and a test set (no optimization)
trainingSet <- trainingAndValidationSet
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
if (options[["intercept"]]) {
formula <- formula(paste(options[["target"]], "~ 1 + ", paste(options[["predictors"]], collapse = " + ")))
} else {
Expand Down
6 changes: 6 additions & 0 deletions R/mlRegressionNeuralNetwork.R
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ mlRegressionNeuralNetwork <- function(jaspResults, dataset, options, ...) {
# Just create a train and a test set (no optimization)
trainingSet <- trainingAndValidationSet
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Structure of neural network
structure <- .getNeuralNetworkStructure(options)
p <- try({
Expand Down Expand Up @@ -167,6 +169,10 @@ mlRegressionNeuralNetwork <- function(jaspResults, dataset, options, ...) {
testSet <- dataset[-trainingIndex, ]
valid <- trainingAndValidationSet[validationIndex, ]
trainingSet <- trainingAndValidationSet[-validationIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Check for factor levels in the validation set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, valid, "validation")
errorStore <- numeric(options[["maxGenerations"]])
trainErrorStore <- numeric(options[["maxGenerations"]])
# For plotting
Expand Down
Loading
Loading