diff --git a/R/commonMachineLearningClassification.R b/R/commonMachineLearningClassification.R index 26752790..daafcc1b 100644 --- a/R/commonMachineLearningClassification.R +++ b/R/commonMachineLearningClassification.R @@ -340,6 +340,7 @@ model[["jaspVars"]] <- list() model[["jaspVars"]]$decoded <- list(target = decodeColNames(options[["target"]]), predictors = decodeColNames(options[["predictors"]])) model[["jaspVars"]]$encoded = list(target = options[["target"]], predictors = options[["predictors"]]) + model[["jaspScaling"]] <- attr(dataset, "jaspScaling") model[["jaspVersion"]] <- .baseCitation model[["explainer"]] <- classificationResult[["explainer"]] class(model) <- c(class(classificationResult[["model"]]), "jaspClassification", "jaspMachineLearning") diff --git a/R/commonMachineLearningRegression.R b/R/commonMachineLearningRegression.R index 99005bcd..61fd1596 100644 --- a/R/commonMachineLearningRegression.R +++ b/R/commonMachineLearningRegression.R @@ -70,6 +70,7 @@ # Scale numeric predictors if (length(unlist(options[["predictors"]])) > 0 && options[["scaleVariables"]]) { + attr(dataset, which = "jaspScaling") <- .getJaspScaling(dataset[, options[["predictors"]], drop = FALSE]) dataset[, options[["predictors"]]] <- .scaleNumericData(dataset[, options[["predictors"]], drop = FALSE]) } return(dataset) @@ -501,6 +502,7 @@ model[["jaspVars"]] <- list() model[["jaspVars"]]$decoded <- list(target = decodeColNames(options[["target"]]), predictors = decodeColNames(options[["predictors"]])) model[["jaspVars"]]$encoded = list(target = options[["target"]], predictors = options[["predictors"]]) + model[["jaspScaling"]] <- attr(dataset, "jaspScaling") model[["jaspVersion"]] <- .baseCitation model[["explainer"]] <- regressionResult[["explainer"]] class(model) <- c(class(regressionResult[["model"]]), "jaspRegression", "jaspMachineLearning") @@ -683,6 +685,14 @@ } } +.getJaspScaling <- function(x) { + idx <- sapply(x, function(x) is.numeric(x) && length(unique(x)) > 1) + cols_to_scale <- colnames(x)[idx] + centers <- sapply(x[cols_to_scale], mean) + scales <- sapply(x[cols_to_scale], sd) + return(list(centers, scales)) +} + # these could also extend the S3 method scale although that could be somewhat unexpected .scaleNumericData <- function(x, ...) { UseMethod(".scaleNumericData", x) diff --git a/R/mlPrediction.R b/R/mlPrediction.R index d32786a2..180843c4 100644 --- a/R/mlPrediction.R +++ b/R/mlPrediction.R @@ -269,25 +269,38 @@ is.jaspMachineLearning <- function(x) { return(names) } +.setJaspScaling <- function(x, centers, scales) { + if (nrow(x) == 0) { + return(x) + } + for (col in names(centers)) { + x[, col] <- (x[, col] - centers[col]) / scales[col] + } + return(x) +} + .mlPredictionReadData <- function(dataset, options, model) { if (length(options[["predictors"]]) == 0) { dataset <- NULL } else { dataset <- jaspBase::excludeNaListwise(dataset, options[["predictors"]]) - if (options[["scaleVariables"]] && length(unlist(options[["predictors"]])) > 0) { - dataset <- .scaleNumericData(dataset) - } # Select only the predictors in the model to prevent accidental double column names dataset <- dataset[, which(decodeColNames(colnames(dataset)) %in% model[["jaspVars"]][["decoded"]]$predictors), drop = FALSE] - # Ensure the column names in the dataset match those in the training data - colnames(dataset) <- .matchDecodedNames(colnames(dataset), model) - # Retrieve the training set - trainingSet <- model[["explainer"]]$data - # Check for factor levels in the test set that are not in the training set - .checkForNewFactorLevelsInPredictionSet(trainingSet, dataset, "prediction", model) - # Ensure factor variables in dataset have same levels as those in the training data - factorColumns <- colnames(dataset)[sapply(dataset, is.factor)] - dataset[factorColumns] <- lapply(factorColumns, function(i) factor(dataset[[i]], levels = levels(trainingSet[[i]]))) + if (NCOL(dataset) == length(model[["jaspVars"]][["decoded"]]$predictors)) { + # Ensure the column names in the dataset match those in the training data + colnames(dataset) <- .matchDecodedNames(colnames(dataset), model) + # Scale the features with the same scaling as the original dataset + if (!is.null(model[["jaspScaling"]])) { + dataset <- .setJaspScaling(dataset, model$jaspScaling[[1]], model$jaspScaling[[2]]) + } + # Retrieve the training set + trainingSet <- model[["explainer"]]$data + # Check for factor levels in the test set that are not in the training set + .checkForNewFactorLevelsInPredictionSet(trainingSet, dataset, "prediction", model) + # Ensure that factor variables in the dataset have their levels ordered the same way as in the training data + factorColumns <- colnames(dataset)[sapply(dataset, is.factor)] + dataset[factorColumns] <- lapply(factorColumns, function(i) factor(dataset[[i]], levels = levels(trainingSet[[i]]))) + } } return(dataset) } @@ -299,7 +312,7 @@ is.jaspMachineLearning <- function(x) { if (ready) { dataset <- dataset[which(colnames(dataset) %in% model[["jaspVars"]][["encoded"]]$predictors)] jaspResults[["predictions"]] <- createJaspState(.mlPredictionGetPredictions(model, dataset)) - jaspResults[["predictions"]]$dependOn(options = c("loadPath", "predictors", "scaleVariables")) + jaspResults[["predictions"]]$dependOn(options = c("loadPath", "predictors")) return(jaspResults[["predictions"]]$object) } else { return(NULL) @@ -326,6 +339,11 @@ is.jaspMachineLearning <- function(x) { if (is.null(model)) { return() } + if (is.null(model[["jaspScaling"]])) { + table$addFootnote(gettext("The features in the new data are unscaled, consistent with the training set.")) + } else { + table$addFootnote(gettext("The features in the new data are scaled the same as those in the training set.")) + } modelVars_encoded <- model[["jaspVars"]][["encoded"]]$predictors modelVars_decoded <- model[["jaspVars"]][["decoded"]]$predictors presentVars_encoded <- colnames(dataset) @@ -389,7 +407,7 @@ is.jaspMachineLearning <- function(x) { return() } table <- createJaspTable(gettext("Predictions for New Data")) - table$dependOn(options = c("predictors", "trainedModelFilePath", "predictionsTable", "predictionsTableFeatures", "scaleVariables", "fromIndex", "toIndex")) + table$dependOn(options = c("predictors", "trainedModelFilePath", "predictionsTable", "predictionsTableFeatures", "fromIndex", "toIndex")) table$position <- position table$addColumnInfo(name = "row", title = gettext("Case"), type = "integer") if (!is.null(model) && inherits(model, "jaspClassification")) { @@ -433,7 +451,7 @@ is.jaspMachineLearning <- function(x) { predictionsColumn <- rep(NA, max(as.numeric(rownames(dataset)))) predictionsColumn[as.numeric(rownames(dataset))] <- predictions[[1]] jaspResults[["predictionsColumn"]] <- createJaspColumn(columnName = options[["predictionsColumn"]]) - jaspResults[["predictionsColumn"]]$dependOn(options = c("predictionsColumn", "predictors", "trainedModelFilePath", "scaleVariables", "addPredictions")) + jaspResults[["predictionsColumn"]]$dependOn(options = c("predictionsColumn", "predictors", "trainedModelFilePath", "addPredictions")) if (inherits(model, "jaspClassification")) jaspResults[["predictionsColumn"]]$setNominal(predictionsColumn) if (inherits(model, "jaspRegression")) jaspResults[["predictionsColumn"]]$setScale(predictionsColumn) } @@ -446,7 +464,7 @@ is.jaspMachineLearning <- function(x) { break } jaspResults[[colName]] <- createJaspColumn(columnName = colName) - jaspResults[[colName]]$dependOn(options = c("predictionsColumn", "predictors", "trainedModelFilePath", "scaleVariables", "addPredictions", "addProbabilities")) + jaspResults[[colName]]$dependOn(options = c("predictionsColumn", "predictors", "trainedModelFilePath", "addPredictions", "addProbabilities")) jaspResults[[colName]]$setScale(predictions[[2]][, i]) } } diff --git a/inst/qml/mlPrediction.qml b/inst/qml/mlPrediction.qml index 536b7c04..d6fc61e3 100644 --- a/inst/qml/mlPrediction.qml +++ b/inst/qml/mlPrediction.qml @@ -57,13 +57,6 @@ Form } } - Group - { - title: qsTr("Algorithmic Settings") - - UI.ScaleVariables { } - } - Group { title: qsTr("Tables")