UCD-SERG · Copilot · Jan 13, 2026 · Jan 13, 2026 · Jan 13, 2026 · Jan 14, 2026
diff --git a/.gitattributes b/.gitattributes
@@ -1 +1,2 @@
 NEWS.md merge=union
+man/* linguist-generated
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
@@ -244,6 +244,26 @@ To regenerate:
 rmarkdown::render("README.Rmd")
 ```
 
+### Version Management
+
+**CRITICAL**: Always ensure the development version in your PR branch is one version number higher than the main branch.
+
+```r
+# Check current version
+desc::desc_get_version()
+
+# Increment development version (use this for PRs)
+usethis::use_version('dev')
+```
+
+**Version Check Workflow**: The `version-check.yaml` workflow will fail if your PR branch version is not higher than the main branch version. Before requesting PR review, always:
+
+1. Check the current version on the main branch (look at DESCRIPTION on main)
+2. Ensure your PR branch version is at least one development version higher
+3. If main is at 1.4.0.9003, your PR should be at minimum 1.4.0.9004
+
+**Why this matters**: This ensures proper version tracking and prevents conflicts when multiple PRs are merged.
+
 ### Package Checking
 
 Run R CMD check to validate the package:
@@ -469,6 +489,54 @@ expect_false(has_missing_values(complete_data))
 4. **Run tests**: Validate all tests pass, updating snapshots only when changes are intentional
 5. **Review snapshots**: When snapshots change, review the diff to ensure changes are expected
 
+## Code Organization Policies
+
+**CRITICAL**: Follow these strict code organization policies for all new code and refactoring work:
+
+### File Organization
+
+1. **One function per file**: Each exported function and its associated S3 methods should be in its own file
+   - File name should match the function name (e.g., `summary.seroincidence.R` for `summary.seroincidence()`)
+   - S3 methods for the same generic can be in the same file (e.g., `compare_seroincidence.seroincidence()`, `compare_seroincidence.seroincidence.by()`, and `compare_seroincidence.default()` all in `compare_seroincidence.R`)
+
+2. **Internal helper functions**: Move to separate files
+   - Use descriptive file names (e.g., `compute_cluster_robust_var.R` for `.compute_cluster_robust_var()`)
+   - Keep related internal functions together when logical
+   - Internal functions should use `.function_name()` naming convention
+
+3. **Print methods**: Each print method in its own file
+   - File name: `print.{class_name}.R` (e.g., `print.seroincidence.R`)
+
+4. **Extract anonymous functions**: Convert complex anonymous functions to named helper functions in separate files
+   - If an anonymous function is longer than ~5 lines, extract it
+   - Name should describe its purpose (e.g., `.helper_function_name()`)
+
+### Example Organization
+
+1. **Long examples**: Move to `inst/examples/exm-{function_name}.R`
+   - Use `@example inst/examples/exm-{function_name}.R` in roxygen documentation
+   - Keep inline `@examples` short (1-3 lines) for simple demonstrations
+
+2. **Example file naming**: `exm-{function_name}.R`
+   - Example: `exm-est_seroincidence.R` for `est_seroincidence()` examples
+
+### Benefits
+
+- **Easier navigation**: Find functions quickly by file name
+- **Better git history**: Changes to one function don't pollute history of unrelated functions
+- **Clearer code review**: Reviewers can focus on individual functions
+- **Reduced merge conflicts**: Multiple people can work on different functions simultaneously
+- **Better organization**: Logical structure makes codebase more maintainable
+
+### Migration Strategy
+
+When refactoring existing code:
+1. Extract functions to separate files
+2. Update any internal calls if needed
+3. Run `devtools::document()` to regenerate documentation
+4. Run `devtools::check()` to ensure no breakage
+5. Run tests to verify functionality unchanged
+
 ## Code Style Guidelines
 
 - **Follow tidyverse style guide**: https://style.tidyverse.org

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: serocalculator
 Title: Estimating Infection Rates from Serological Data
-Version: 1.4.0.9003
+Version: 1.4.0.9004
-Version: 1.4.0.9004
+Version: 1.4.0.9002
-Version: 1.4.0.9004
+Version: 1.4.0.9002
 Authors@R: c(
     person("Kristina", "Lai", , "kwlai@ucdavis.edu", role = c("aut", "cre")),
     person("Chris", "Orwa", role = "aut"),
@@ -77,7 +77,7 @@ Encoding: UTF-8
 Language: en-US
 LazyData: true
 NeedsCompilation: no
-Roxygen: list(markdown = TRUE, roclets = c("collate", "rd", "namespace", "devtag::dev_roclet"))
+Roxygen: list(markdown = TRUE, roclets = c("collate", "rd", "namespace"))
 RoxygenNote: 7.3.3
 Remotes: 
     d-morrison/snapr
diff --git a/NEWS.md b/NEWS.md
@@ -1,9 +1,42 @@
 # serocalculator (development version)
 
+## New features
+
+* Added `cluster_var` and `stratum_var` parameters to `est_seroincidence()` and 
+  `est_seroincidence_by()` to support cluster-robust standard error estimation. 
+  When `cluster_var` is specified, `summary.seroincidence()` automatically computes 
+  cluster-robust (sandwich) variance estimates to account for within-cluster 
+  correlation in clustered sampling designs such as household or school-based surveys.
+* `cluster_var` parameter now accepts multiple variables (e.g., `c("school", "classroom")`)
+  for multi-level clustered sampling designs. Cluster-robust standard errors will account
+  for all specified clustering levels.
+
+## Bug fixes
+
+* Fixed column naming issue in `summary.seroincidence()` where cluster-robust standard
+  errors caused `[]` notation in column names (`SE[,1]` instead of `SE`).
+* Added `se_type` column to `summary.seroincidence()` output to clearly indicate whether
+  "standard" or "cluster-robust" standard errors are being used.
+* Fixed `est_seroincidence_by()` to properly pass cluster and stratum variables through
+  to stratified analyses. Previously, these variables were dropped during data stratification,
+  causing errors when trying to use clustering with `est_seroincidence_by()`.
+
+## Code organization
+
+* Refactored clustering-related code following package organization policies:
+  - Moved `.compute_cluster_robust_var()` to `R/compute_cluster_robust_var.R`
+  - Each function now in its own file for better maintainability and git history
+* Updated copilot-instructions.md with code organization policies
+
 # serocalculator 1.4.0
 
 ## New features
 
+* Added support for cluster-robust standard errors in `est_seroincidence()` through
+  new `cluster_var` and `stratum_var` parameters. When `cluster_var` is specified,
+  `summary.seroincidence()` automatically computes cluster-robust (sandwich) variance
+  estimates to account for within-cluster correlation in clustered sampling designs
+  such as household or school-based surveys.
 * Added `compare_seroincidence()` function for statistical comparison of seroincidence rates
   - Performs two-sample z-tests to compare seroincidence estimates
   - Returns `htest` format when comparing two single estimates

diff --git a/R/compute_cluster_robust_var.R b/R/compute_cluster_robust_var.R
@@ -0,0 +1,119 @@
+#' Compute cluster-robust variance for seroincidence estimates
+#'
+#' @description
+#' Computes cluster-robust (sandwich) variance estimates for seroincidence
+#' parameter estimates when data come from a clustered sampling design.
+#' This adjusts the standard errors to account for within-cluster correlation.
+#'
+#' @param fit a `seroincidence` object from [est_seroincidence()]
+#' @param cluster_var name(s) of the cluster variable(s) in the data.
+#'   Can be a single variable or vector of variables for multi-level clustering.
+#' @param stratum_var optional name of the stratum variable
+#'
+#' @return variance of log(lambda) accounting for clustering
+#' @keywords internal
+#' @noRd
+.compute_cluster_robust_var <- function(
+    fit,
+    cluster_var,
+    stratum_var = NULL) {
+  # Extract stored data (already split by antigen_iso)
+  pop_data_list <- attr(fit, "pop_data")
+  sr_params_list <- attr(fit, "sr_params")
+  noise_params_list <- attr(fit, "noise_params")
+  antigen_isos <- attr(fit, "antigen_isos")
+
+  # Get MLE estimate
+  log_lambda_mle <- fit$estimate
+
+  # Combine pop_data list back into a single data frame
+  # to get cluster info
+  pop_data_combined <- do.call(rbind, pop_data_list)
+
+  # Compute score (gradient) using numerical differentiation
+  # The score is the derivative of log-likelihood w.r.t. log(lambda)
+  epsilon <- 1e-6
+
+  # For each observation, compute the contribution to the score
+  # We need to identify which cluster each observation belongs to
+
+  # Handle multiple clustering levels by creating composite cluster ID
+  if (length(cluster_var) == 1) {
+    cluster_ids <- pop_data_combined[[cluster_var]]
+  } else {
+    # Create composite cluster ID from multiple variables
+    cluster_ids <- interaction(
+      pop_data_combined[, cluster_var, drop = FALSE],
+      drop = TRUE,
+      sep = "_"
+    )
+  }
+
+  # Get unique clusters
+  unique_clusters <- unique(cluster_ids)
+  n_clusters <- length(unique_clusters)
+
+  # Compute cluster-level scores
+  cluster_scores <- numeric(n_clusters)
+
+  for (i in seq_along(unique_clusters)) {
+    cluster_id <- unique_clusters[i]
+
+    # Get observations in this cluster
+    cluster_mask <- cluster_ids == cluster_id
+
+    # Create temporary pop_data with only this cluster
+    pop_data_cluster <- pop_data_combined[cluster_mask, , drop = FALSE]
+
+    # Split by antigen
+    pop_data_cluster_list <- split(
+      pop_data_cluster,
+      pop_data_cluster$antigen_iso
+    )
+
+    # Ensure all antigen_isos are represented
+    # (add empty data frames if missing)
+    for (ag in antigen_isos) {
+      if (!ag %in% names(pop_data_cluster_list)) {
+        # Create empty data frame with correct structure
+        pop_data_cluster_list[[ag]] <- pop_data_list[[ag]][0, , drop = FALSE]
+      }
+    }
+
+    # Compute log-likelihood for this cluster at MLE
+    ll_cluster_mle <- -(.nll(
+      log.lambda = log_lambda_mle,
+      pop_data = pop_data_cluster_list,
+      antigen_isos = antigen_isos,
+      curve_params = sr_params_list,
+      noise_params = noise_params_list,
+      verbose = FALSE
+    ))
+
+    # Compute log-likelihood at MLE + epsilon
+    ll_cluster_plus <- -(.nll(
+      log.lambda = log_lambda_mle + epsilon,
+      pop_data = pop_data_cluster_list,
+      antigen_isos = antigen_isos,
+      curve_params = sr_params_list,
+      noise_params = noise_params_list,
+      verbose = FALSE
+    ))
+
+    # Numerical derivative (score for this cluster)
+    cluster_scores[i] <- (ll_cluster_plus - ll_cluster_mle) / epsilon
+  }
+
+  # Compute B matrix (middle of sandwich)
+  # B = sum of outer products of cluster scores
+  b_matrix <- sum(cluster_scores^2) # nolint: object_name_linter
+
+  # Get Hessian (already computed by nlm)
+  h_matrix <- fit$hessian # nolint: object_name_linter
+
+  # Sandwich variance: V = H^(-1) * B * H^(-1)
+  # Since we have a scalar parameter, this simplifies to:
+  var_log_lambda_robust <- b_matrix / (h_matrix^2)
+
+  return(var_log_lambda_robust)
+}