#
# Copyright (C) 2013-2021 University of Amsterdam
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

mlClassificationLogistic <- function(jaspResults, dataset, options, ...) {

  # Preparatory work
  dataset <- .mlClassificationReadData(dataset, options)
  .mlClassificationErrorHandling(dataset, options, type = "logistic")

  # Check if the analysis is ready to run
  ready <- .mlClassificationReady(options, type = "logistic")

  # Compute the results and create the model summary table
  .mlClassificationTableSummary(dataset, options, jaspResults, ready, position = 1, type = "logistic")

  # If the user wants to add the predicted classes to the data set
  .mlClassificationAddPredictionsToData(dataset, options, jaspResults, ready)

  # Add the test set indicator to the data
  .mlAddTestIndicatorToData(options, jaspResults, ready, purpose = "classification")

  # Create the data split plot
  .mlPlotDataSplit(dataset, options, jaspResults, ready, position = 2, purpose = "classification", type = "logistic")

  # Create the confusion table
  .mlClassificationTableConfusion(dataset, options, jaspResults, ready, position = 3)

  # Create the class proportions table
  .mlClassificationTableProportions(dataset, options, jaspResults, ready, position = 4)

  # Create the validation measures table
  .mlClassificationTableMetrics(dataset, options, jaspResults, ready, position = 5)

  # # Create the variable importance table
  # .mlTableFeatureImportance(options, jaspResults, ready, position = 6, purpose = "classification")

  # # Create the shap table
  # .mlTableShap(dataset, options, jaspResults, ready, position = 7, purpose = "classification")

  # # Create the ROC curve
  # .mlClassificationPlotRoc(dataset, options, jaspResults, ready, position = 8, type = "logistic")

  # Create the Andrews curves
  .mlClassificationPlotAndrews(dataset, options, jaspResults, ready, position = 9)

  # # Decision boundaries
  # .mlClassificationPlotBoundaries(dataset, options, jaspResults, ready, position = 10, type = "logistic")
}

.logisticRegressionClassification <- function(dataset, options, jaspResults, ready) {
  # Import the model formula from jaspResults
  formula <- jaspResults[["formula"]]$object
  # Split the data into training and test sets
  if (options[["holdoutData"]] == "testSetIndicator" && options[["testSetIndicatorVariable"]] != "") {
    # Select observations according to a user-specified indicator (observations with indicator = 1 go to the test set)
    trainingIndex <- which(dataset[, options[["testSetIndicatorVariable"]]] == 0)
  } else {
    # Sample a percentage of the total data set
    trainingIndex <- sample.int(nrow(dataset), size = ceiling((1 - options[["testDataManual"]]) * nrow(dataset)))
  }
  trainingSet <- dataset[trainingIndex, ]
  # Create the generated test set indicator
  testIndicatorColumn <- rep(1, nrow(dataset))
  testIndicatorColumn[trainingIndex] <- 0
  # Just create a train and a test set (no optimization)
  testSet <- dataset[-trainingIndex, ]
  if (nlevels(trainingSet[[options[["target"]]]]) == 2) {
    family <- "binomial"
    trainingFit <- stats::glm(formula, data = trainingSet, family = family)
    # Use the fitted model to make predictions for the test set and the full data set
    # (predicted probabilities are rounded, so values above 0.5 map to the second factor level; see the sketch after this function)
    testPredictions <- levels(trainingSet[[options[["target"]]]])[round(predict(trainingFit, newdata = testSet, type = "response"), 0) + 1]
    dataPredictions <- levels(trainingSet[[options[["target"]]]])[round(predict(trainingFit, newdata = dataset, type = "response"), 0) + 1]
  } else {
    family <- "multinomial"
    trainingFit <- VGAM::vglm(formula, data = trainingSet, family = family)
    # Use the fitted model to make predictions for the test set and the full data set
    testPredictions <- .mlClassificationMultinomialPredictions(trainingSet, options, predict(trainingFit, newdata = testSet))
    dataPredictions <- .mlClassificationMultinomialPredictions(trainingSet, options, predict(trainingFit, newdata = dataset))
  }
  # Create the results object
  result <- list()
  result[["formula"]] <- formula
  result[["family"]] <- family
  result[["model"]] <- trainingFit
  result[["confTable"]] <- table("Pred" = testPredictions, "Real" = testSet[, options[["target"]]])
  result[["testAcc"]] <- sum(diag(prop.table(result[["confTable"]])))
  # result[["auc"]] <- .classificationCalcAUC(testSet, trainingSet, options, "logisticClassification")
  result[["ntrain"]] <- nrow(trainingSet)
  result[["ntest"]] <- nrow(testSet)
  result[["testReal"]] <- testSet[, options[["target"]]]
  result[["testPred"]] <- testPredictions
  result[["train"]] <- trainingSet
  result[["test"]] <- testSet
  result[["testIndicatorColumn"]] <- testIndicatorColumn
  result[["classes"]] <- dataPredictions
  # result[["explainer"]] <- DALEX::explain(result[["model"]], type = "classification", data = result[["train"]], y = result[["train"]][, options[["target"]]], predict_function = function(model, data) predict(model, newdata = data, type = "raw"))
  # if (nlevels(result[["testReal"]]) == 2) {
  #   result[["explainer_fi"]] <- DALEX::explain(result[["model"]], type = "classification", data = result[["train"]], y = as.numeric(result[["train"]][, options[["target"]]]) - 1, predict_function = function(model, data) predict(model, newdata = data, type = "class"))
  # } else {
  #   result[["explainer_fi"]] <- DALEX::explain(result[["model"]], type = "multiclass", data = result[["train"]], y = result[["train"]][, options[["target"]]], predict_function = function(model, data) predict(model, newdata = data, type = "raw"))
  # }
  return(result)
}
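
# Illustrative sketch (not part of the analysis; the names `fit`, `train`, `test`, and `y` are hypothetical):
# the binary branch above turns predicted probabilities into class labels by rounding at 0.5.
# Assuming a fitted stats::glm object `fit`, a training set `train` with a two-level factor target `y`,
# and a test set `test`, the same rule reads:
#
# p <- predict(fit, newdata = test, type = "response")
# predictedLabels <- levels(train$y)[round(p, 0) + 1]  # probabilities above 0.5 map to the second level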

.mlClassificationMultinomialPredictions <- function(trainingSet, options, predictions) {
  # The multinomial vglm fit returns linear predictors: the log-odds of each non-reference level
  # against the reference (last) level of the target, one column per non-reference level
  num_categories <- ncol(predictions) + 1
  probs <- matrix(0, nrow = nrow(predictions), ncol = num_categories)
  # Apply the softmax transform: exponentiate the linear predictors ...
  for (i in 1:(num_categories - 1)) {
    probs[, i] <- exp(predictions[, i])
  }
  # ... fix the reference category at exp(0) = 1 ...
  probs[, num_categories] <- 1
  # ... and normalize each row so that the class probabilities sum to 1
  row_sums <- rowSums(probs)
  probs <- probs / row_sums
  # Assign each observation to the class with the highest probability
  predicted_category <- apply(probs, 1, which.max)
  categories <- levels(trainingSet[[options[["target"]]]])
  predicted_categories <- categories[predicted_category]
  return(predicted_categories)
}
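
# Illustrative sketch (not part of the analysis; the names `fit`, `train`, and `test` are hypothetical):
# for a fitted VGAM::vglm multinomial model `fit`, predict() returns the log-odds matrix expected by
# the helper above, so test set class labels and a confusion table can be obtained as follows,
# given a training set `train`, a test set `test`, and the analysis `options` list:
#
# preds <- .mlClassificationMultinomialPredictions(train, options, predict(fit, newdata = test))
# table("Pred" = preds, "Real" = test[[options[["target"]]]])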