Start scaling

koenderks · koenderks · commit e5a305827ef2 · 2025-04-18T15:42:13.000+02:00
diff --git a/R/commonMachineLearningRegression.R b/R/commonMachineLearningRegression.R
@@ -501,6 +501,8 @@
     model[["jaspVars"]] <- list()
     model[["jaspVars"]]$decoded <- list(target = decodeColNames(options[["target"]]), predictors = decodeColNames(options[["predictors"]]))
     model[["jaspVars"]]$encoded = list(target = options[["target"]], predictors = options[["predictors"]])
+    model[["jaspScaling"]] <- .getJaspScaling(dataset[, options[["predictors"]], drop = FALSE])
+	print(model[["jaspScaling"]])
     model[["jaspVersion"]] <- .baseCitation
     model[["explainer"]] <- regressionResult[["explainer"]]
     class(model) <- c(class(regressionResult[["model"]]), "jaspRegression", "jaspMachineLearning")
@@ -683,6 +685,36 @@
   }
 }
 
+# Training-set scaling parameters: the mean and standard deviation of every numeric,
+# non-constant column of `x`, named by decoded column name. Returns
+# list(centers, scales); positional access ([[1]], [[2]]) keeps working for callers.
+.getJaspScaling <- function(x) {
+  idx <- vapply(x, function(col) is.numeric(col) && length(unique(col)) > 1, logical(1))
+  varying <- x[, idx, drop = FALSE]
+  centers <- vapply(varying, mean, numeric(1))
+  scales <- vapply(varying, stats::sd, numeric(1))
+  names(centers) <- names(scales) <- decodeColNames(colnames(x))[idx]
+  # The original returned the undefined symbol `scaling`, which errors at run time;
+  # the computed standard deviations live in `scales`.
+  list(centers = centers, scales = scales)
+}
+
+# Apply stored training-set scaling to `x`: every numeric column whose decoded name has
+# an entry in `center` and `scale` becomes (value - center) / scale; others are untouched.
+.setJaspScaling <- function(x, center, scale) {
+  if (nrow(x) == 0) {
+    return(x)
+  }
+  decoded <- decodeColNames(colnames(x))
+  # Select columns via the stored parameters, not the variability of `x`: single-row or
+  # constant new data is still scaled, and columns unscaled in training never hit NA.
+  known <- decoded %in% intersect(names(center), names(scale))
+  for (i in which(known & vapply(x, is.numeric, logical(1)))) {
+    x[[i]] <- (x[[i]] - center[[decoded[i]]]) / scale[[decoded[i]]]
+  }
+  x
+}
+
 # these could also extend the S3 method scale although that could be somewhat unexpected
 .scaleNumericData <- function(x, ...) {
   UseMethod(".scaleNumericData", x)
diff --git a/R/mlPrediction.R b/R/mlPrediction.R
@@ -274,13 +274,18 @@ is.jaspMachineLearning <- function(x) {
     dataset <- NULL
   } else {
     dataset <- jaspBase::excludeNaListwise(dataset, options[["predictors"]])
-    if (options[["scaleVariables"]] && length(unlist(options[["predictors"]])) > 0) {
-      dataset <- .scaleNumericData(dataset)
-    }
     # Select only the predictors in the model to prevent accidental double column names
     dataset <- dataset[, which(decodeColNames(colnames(dataset)) %in% model[["jaspVars"]][["decoded"]]$predictors), drop = FALSE]
     # Ensure the column names in the dataset match those in the training data
     colnames(dataset) <- .matchDecodedNames(colnames(dataset), model)
+    # Scale the features with the same scaling as the original dataset
+    if (options[["scaleVariables"]] && length(unlist(options[["predictors"]])) > 0) {
+      if (is.null(model[["jaspScaling"]])) {
+        dataset <- .scaleNumericData(dataset)
+      } else {
+        dataset <- .setJaspScaling(dataset, model$jaspScaling[[1]], model$jaspScaling[[2]])
+      }
+    }
     # Retrieve the training set
     trainingSet <- model[["explainer"]]$data
     # Check for factor levels in the test set that are not in the training set