# Author: Ahmed El-Gabbas
# Date: 2025-12-18
#
# - Binary classification for PA/PB data using e1071::svm with probability
#   outputs.
# - Tuning: parsimonious grid with 5-fold CV via e1071::tune:
#     kernel = "radial"
#     cost  in {1, 5, 10}
#     gamma in {0.01, 0.05, 0.1}
#   The best model (tune.out$best.model) is returned for prediction.
# - Class weights: inverse-prevalence weighting (capped at 20) to handle
#   imbalance:
#     n0 = count of class "0", n1 = count of class "1"
#     weights = c("0" = 1, "1" = min(n0/n1, 20)) if n0 >= n1
#               c("0" = min(n1/n0, 20), "1" = 1) otherwise
# - Prediction: predict(..., probability = TRUE); returns the positive-class
#   probability:
#     * Assumes the response is encoded as 0/1 and appears as the left-hand
#       side of the formula.
#     * Predictors and their names in new data must match those used at
#       training time.
#     * The grid is intentionally small so fitting remains fast while still
#       yielding robust models.
|
methodInfo <- list(
  # Aliases under which this method is registered with the sdm package.
  name = c("svm3", "SVM3", "svm_e1071_3"),
  packages = "e1071",
  # Restricted to presence-absence ("pa") and presence-background ("pb")
  # data: fitFunction coerces the response to a 0/1 factor, so abundance
  # ("ab") and count ("n") responses cannot be handled (declaring them would
  # silently turn every non-0/1 value into NA).
  modelTypes = c("pa", "pb"),
  fitParams = list(
    formula = "standard.formula", data = "sdmDataFrame", v = "sdmVariables"),
  fitSettings = list(kernel = "radial"),

  # Fit a radial-kernel SVM via a small 5-fold CV tuning grid
  # (cost x gamma), with inverse-prevalence class weights (capped at 20) to
  # counter class imbalance. Returns the best model found by e1071::tune.
  fitFunction = function(formula, data, v, ...) {

    # Re-create the formula in the local environment so e1071::tune can
    # resolve variables. NOTE: deparse() wraps long formulas onto MULTIPLE
    # lines; collapsing with paste() is required, otherwise as.formula()
    # would only see the first line and silently drop predictors.
    formula <- as.formula(
      paste(deparse(formula), collapse = " "), env = environment())
    resp <- all.vars(formula)[1]
    data[, resp] <- factor(data[, resp], levels = c(0L, 1L))

    # Fail loudly on non-binary responses: any value other than 0/1 was
    # coerced to NA above and would otherwise be dropped silently by svm().
    if (anyNA(data[, resp])) {
      stop("Response '", resp, "' must be binary (0/1) for this method.",
           call. = FALSE)
    }

    # Inverse-prevalence weight for the minority class, capped at
    # max_weight to avoid overly aggressive rebalancing on extremely
    # imbalanced data; max(1, .) guards against division by zero.
    n0 <- sum(data[, resp] == "0")
    n1 <- sum(data[, resp] == "1")
    max_weight <- 20
    if (n0 >= n1) {
      class.weights <- c("0" = 1, "1" = min(n0 / max(1, n1), max_weight))
    } else {
      class.weights <- c("0" = min(n1 / max(1, n0), max_weight), "1" = 1)
    }

    # Parsimonious 3 x 3 grid with 5-fold CV. probability = TRUE fits the
    # auxiliary probability model so predictFunction can return class
    # probabilities rather than hard labels.
    tune.out <- e1071::tune(
      e1071::svm, train.x = formula, data = data, kernel = "radial",
      ranges = list(cost = c(1, 5, 10), gamma = c(0.01, 0.05, 0.1)),
      class.weights = class.weights, probability = TRUE,
      tunecontrol = e1071::tune.control(cross = 5))

    tune.out$best.model
  },
  settingRules = NULL,
  tuneParams = NULL,
  predictParams = list(
    object = "model", formula = "standard.formula", newx = "sdmDataFrame",
    v = "sdmVariables"),
  predictSettings = list(),

  # Return the positive-class ("1") probability for each row of `newx`.
  # Predictor names in `newx` must match those used at training time.
  predictFunction = function(object, formula, newx, v, ...) {
    pred_probs <- predict(
      object = object, newdata = newx, probability = TRUE, ...)
    probs <- attr(pred_probs, "probabilities")
    # A model fitted without probability = TRUE carries no "probabilities"
    # attribute; raise an informative error instead of an opaque subscript
    # failure on NULL.
    if (is.null(probs)) {
      stop("Model was not fitted with probability = TRUE; ",
           "class probabilities are unavailable.", call. = FALSE)
    }
    probs[, "1"]
  }
)