|
| 1 | +#' One-way ANOVA (from scratch) |
| 2 | +#' |
| 3 | +#' @description |
| 4 | +#' Computes a one-way Analysis of Variance (ANOVA) to test whether k group means |
| 5 | +#' are equal. Implements sums of squares, F statistic, and p-value without using \code{aov()}. |
| 6 | +#' |
| 7 | +#' @param x Numeric vector of observations. |
| 8 | +#' @param g Factor or character vector of group labels (same length as \code{x}). |
| 9 | +#' |
| 10 | +#' @return A list with: |
| 11 | +#' \item{df_between}{Degrees of freedom between groups (k - 1).} |
| 12 | +#' \item{df_within}{Degrees of freedom within groups (n - k).} |
| 13 | +#' \item{ss_between}{Between-group sum of squares.} |
| 14 | +#' \item{ss_within}{Within-group sum of squares.} |
| 15 | +#' \item{ms_between}{Between-group mean square.} |
| 16 | +#' \item{ms_within}{Within-group mean square.} |
| 17 | +#' \item{F}{F-statistic.} |
| 18 | +#' \item{p_value}{Right-tail p-value from F distribution.} |
| 19 | +#' |
| 20 | +#' @details |
| 21 | +#' One-way ANOVA partitions total variance into between-group and within-group components: |
| 22 | +#' \deqn{SS_T = SS_B + SS_W.} |
| 23 | +#' The test statistic \eqn{F = MS_B / MS_W} follows an F distribution under H0 (equal means). |
| 24 | +#' |
| 25 | +#' @examples |
| 26 | +#' set.seed(0) |
| 27 | +#' x <- c(rnorm(20, 0, 1), rnorm(22, 0.2, 1), rnorm(18, -0.1, 1)) |
| 28 | +#' g <- factor(rep(c("A", "B", "C"), times = c(20, 22, 18))) |
| 29 | +#' res <- anova_oneway(x, g) |
| 30 | +#' res$F; res$p_value |
| 31 | +#' |
| 32 | +#' @export |
| 33 | +anova_oneway <- function(x, g) { |
| 34 | + if (!is.numeric(x)) stop("`x` must be numeric.") |
| 35 | + if (length(x) != length(g)) stop("`x` and `g` must have the same length.") |
| 36 | + if (!is.factor(g)) g <- factor(g) |
| 37 | + |
| 38 | + n <- length(x) |
| 39 | + k <- nlevels(g) |
| 40 | + if (k < 2L) stop("Need at least 2 groups.") |
| 41 | + if (n <= k) stop("Total observations must exceed number of groups.") |
| 42 | + |
| 43 | + grand_mean <- mean(x) |
| 44 | + |
| 45 | + # group stats |
| 46 | + group_means <- tapply(x, g, mean) |
| 47 | + group_sizes <- tapply(x, g, length) |
| 48 | + |
| 49 | + # SS_between = sum_{groups} n_j * (mean_j - grand_mean)^2 |
| 50 | + ss_between <- sum(group_sizes * (group_means - grand_mean)^2) |
| 51 | + |
| 52 | + # SS_within = sum over groups of sum (x_ij - mean_j)^2 |
| 53 | + ss_within <- sum(unlist(tapply(x, g, function(v) sum((v - mean(v))^2)))) |
| 54 | + |
| 55 | + df_between <- k - 1L |
| 56 | + df_within <- n - k |
| 57 | + |
| 58 | + ms_between <- ss_between / df_between |
| 59 | + ms_within <- ss_within / df_within |
| 60 | + |
| 61 | + F_stat <- ms_between / ms_within |
| 62 | + p_val <- stats::pf(F_stat, df_between, df_within, lower.tail = FALSE) |
| 63 | + |
| 64 | + list( |
| 65 | + df_between = df_between, |
| 66 | + df_within = df_within, |
| 67 | + ss_between = ss_between, |
| 68 | + ss_within = ss_within, |
| 69 | + ms_between = ms_between, |
| 70 | + ms_within = ms_within, |
| 71 | + F = F_stat, |
| 72 | + p_value = p_val |
| 73 | + ) |
| 74 | +} |
| 75 | + |
| 76 | +# --- Simple self-test (uncomment to run locally) --- |
| 77 | +# set.seed(1) |
| 78 | +# x <- c(rnorm(15, 0, 1), rnorm(15, 0.5, 1), rnorm(15, 1, 1)) |
| 79 | +# g <- factor(rep(c("A","B","C"), each = 15)) |
| 80 | +# res <- anova_oneway(x, g) |
| 81 | +# stopifnot(res$F > 0, res$p_value >= 0, res$p_value <= 1) |
0 commit comments