Updated GoF test

EricMarcon · web-flow · commit bb7795eca118 · 2025-05-27T16:09:55.000+02:00
Myllymäki et al., 2015; Mrkvička et al., 2017.
diff --git a/NAMESPACE b/NAMESPACE
@@ -9,6 +9,7 @@ importFrom("RcppParallel", "RcppParallelLibs")
 importFrom("rlang", ".data")
 importFrom("spatstat.geom", "marks")
 importFrom("doFuture", "%dofuture%")
+importFrom("stats", "quantile", "sd")
 
 export("as.Dtable")
 export("as.wmppp")
diff --git a/R/GoFtest.R b/R/GoFtest.R
@@ -1,9 +1,36 @@
-GoFtest <- function(Envelope) {
+GoFtest <- function(
+    Envelope,
+    Test = "DCLF",
+    Scaling = "Asymmetric",
+    Range = NULL) {
 
   # Verify Envelope
   if (!inherits(Envelope, "envelope")) {
     stop("Envelope is not of class envelope")
   }
+  # Verify Scaling
+  if (!is.character(Scaling) | !is.vector(Scaling) | !length(Scaling) == 1) {
+    stop("Argument 'Scaling' must be a character vector of length one")
+  }
+  if (!(Scaling %in% c("Quantile", "Studentized", "Asymmetric", "None"))) {
+    stop("Invalid argument: 'Scaling'. Accepted arguments are: Quantile,
+    Studentized, Asymmetric, None.")
+  }
+  # Verify Test
+  if (!is.character(Test) | !is.vector(Test) | !length(Test) == 1) {
+    stop("Argument 'Test' must be a character vector of length one")
+  }
+  if (!(Test %in% c("DCLF", "Integral", "MAD"))) {
+    stop("Invalid argument: 'Test'. Accepted arguments are: DCLF, Integral,
+         MAD.")
+  }
+  # Verify Range
+  if (!is.null(Range) && (!is.vector(Range) |
+                          !is.numeric(Range) |
+                          length(Range) != 2)) {
+    stop("Invalid argument: 'Range'. Accepted arguments are a numeric vector of length two,
+         specifying the minimum and maximum distances over which to compute the test.")
+  }
   # Verify simulations
   if (is.null(attr(Envelope, "simfuns"))) {
     stop("Envelope does not contain simulations in its attribute simfuns")
@@ -13,32 +40,93 @@ GoFtest <- function(Envelope) {
     SimulatedValues <- as.data.frame(attr(Envelope, "simfuns"))[, -1]
   }
 
+  # Transform observed Ls into K for the test (L envelopes are constructed from
+  # the K function)
+  if (any(attr(Envelope, "fname") %in% c("L", "Lmm"))) {
+    ActualValues <- (ActualValues + r)^2 * pi
+  }
+
+  # Restrict analysis to chosen distance range
+  if (!is.null(Range)) {
+    if(min(Range) < max(r) && max(Range) > min(r)) {
+      SelectedR <- (r > min(Range) & r < max(Range))
+      ActualValues <- ActualValues[SelectedR]
+      SimulatedValues <- SimulatedValues[SelectedR, ]
+      r <- r[SelectedR]
+    } else {
+      warning("The selected range is outside the simulated distances.
+              The test was computed using all distances from the envelope.")
+    }
+  }
+
   NumberOfSimulations <- dim(SimulatedValues)[2]
   AverageSimulatedValues <- apply(SimulatedValues, 1, sum) / (NumberOfSimulations - 1)
-  rIncrements <- (r - c(0,r)[seq_along(r)])[-1]
+  rIncrements <- (r - c(0, r)[seq_along(r)])[-1]
+
+  # Calculate weights to scale residuals of the statistic
+  Weights <- switch(Scaling,
+                    "Studentized" = 1 / apply(SimulatedValues, 1, sd, na.rm = T),
+                    "Quantile" = 1 / (apply(SimulatedValues, 1,
+                                          quantile, probs = 0.975, na.rm = T) -
+                                      apply(SimulatedValues, 1,
+                                            quantile, probs = 0.025, na.rm = T)),
+                    "Asymmetric" = {
+                      Upper <- 1 / (apply(SimulatedValues, 1,
+                                        quantile, probs = 0.975,na.rm = T) -
+                                    AverageSimulatedValues)
+                      Lower <- 1 / (AverageSimulatedValues -
+                                    apply(SimulatedValues, 1,
+                                          quantile, probs = 0.025, na.rm = T))
+                      list(UprW = Upper, LwrW = Lower)
+                    },
+                    "None" = rep(1, length(r)))
 
-  # Ui calculate the statistic for a simulation
-  Ui <- function(SimulationNumber) {
-    Departure <- (SimulatedValues[, SimulationNumber] -
-      AverageSimulatedValues)[seq_along(r) - 1]
-    WeightedDeparture <- (Departure[!is.nan(Departure)])^2 *
-      rIncrements[!is.nan(Departure)]
-    return(sum(WeightedDeparture))
+  # Ui calculate the statistic for one simulation
+  Ui <- function(SimulationNumber, ValueToTest) {
+    Departure <- (ValueToTest[, SimulationNumber] -
+                    AverageSimulatedValues)[seq_along(r) - 1]
+    if (inherits(Weights, "list")) {
+      ScaledDeparture <- sapply(seq_along(Departure),
+                                FUN= function(x) ifelse(Departure[x] >= 0,
+                                                        Departure[x] * Weights$UprW[x],
+                                                        Departure[x] * Weights$LwrW[x]))
+      ScaledDeparture <- as.vector(ScaledDeparture)
+    } else {
+      ScaledDeparture <- Departure * Weights[seq_along(r) - 1]
+    }
+    GofStatistic <- switch(Test,
+                           "DCLF" =
+                             sum((ScaledDeparture[!is.nan(ScaledDeparture)])^2 *
+                                   rIncrements[!is.nan(ScaledDeparture)],
+                                 na.rm = T),
+                           "Integral" =
+                             sum(abs((ScaledDeparture[!is.nan(ScaledDeparture)])) *
+                                   rIncrements[!is.nan(ScaledDeparture)],
+                                 na.rm = T),
+                           "MAD" = max(abs(ScaledDeparture), na.rm = T))
+    return(GofStatistic)
   }
 
   # Calculate the Ui statistic for all simulations
   SimulatedU <- vapply(
     seq_len(NumberOfSimulations),
     FUN = Ui,
-    FUN.VALUE = 0
+    FUN.VALUE = 0,
+    ValueToTest = SimulatedValues
   )
 
-  # Calculate the statistic for the actual value
-  RecenteredValues <- (ActualValues - AverageSimulatedValues)[seq_along(r) - 1]
-  WeightedRecenteredValues <- (RecenteredValues[!is.nan(RecenteredValues)])^2 *
-    rIncrements[!is.nan(RecenteredValues)]
-  ActualU <- sum(WeightedRecenteredValues)
+  # Calculate the Ui statistic for the actual value
+  ActualU <- vapply(
+    1,
+    FUN = Ui,
+    FUN.VALUE = 0,
+    ValueToTest = data.frame(ActualValues)
+  )
 
-  # Return the rank
-  return(mean(ActualU < SimulatedU))
+  # Return the p_value. If the p_value is equal to 0, a conservative p_value
+  # of 1/(n + 1) is returned.
+  return(ifelse(mean(ActualU < SimulatedU) == 0,
+                1 / (1 + NumberOfSimulations),
+                mean(ActualU < SimulatedU)))
 }
+
diff --git a/dbmss.Rproj b/dbmss.Rproj
@@ -1,5 +1,5 @@
 Version: 1.0
-ProjectId: cd05f60b-7ee9-4c31-a1ff-518244563ba2
+ProjectId: 9bfc979a-9f1d-4b54-90ce-a8660d10f9b2
 
 RestoreWorkspace: Default
 SaveWorkspace: Default
diff --git a/man/GoFtest.Rd b/man/GoFtest.Rd
@@ -7,35 +7,75 @@
   Calculates the risk to reject the null hypothesis erroneously, based on the distribution of the simulations.
 }
 \usage{
-GoFtest(Envelope)
+GoFtest(Envelope,
+        Test = "DCLF",
+        Scaling = "Asymmetric",
+        Range = NULL)
 }
 \arguments{
   \item{Envelope}{
   An envelope object (\code{\link[spatstat.explore]{envelope}}) containing simulations in its \code{simfuns} attribute. It may be the result of any estimation function of the dbmss package or obtained by the \code{\link[spatstat.explore]{envelope}} function with argument \code{savefuns=TRUE}.
   }
+  \item{Test}{
+  A string specifying the method to summarize the deviation from the null hypothesis. The deviation may be summarized using
+  "\emph{DCLF}": the integrated squared deviation is used, i.e. a Diggle-Cressie-Loosmore-Ford (DCLF) test is performed (default);
+  "\emph{Integral}": the integrated absolute deviation is used;
+  "\emph{MAD}": the Maximum Absolute Deviation (MAD) is used, i.e. a MAD test is performed.
+  }
+  \item{Scaling}{
+  A string specifying the method to scale the residuals of the test. Scaling may be
+  "\emph{Asymmetric}": the difference between the 2.5\% lower quantile of the simulations and the expected value is used to scale negative residuals, and the difference between the 2.5\% upper quantile and the expected value is used to scale positive residuals (default);
+  "\emph{Quantile}": the range between the 2.5\% upper and 2.5\% lower quantiles is used to scale residuals, regardless of whether they are negative or positive;
+  "\emph{Studentized}": the standard deviation of simulations is utilized;
+  "\emph{None}": does not scale residuals.
+  }
+  \item{Range}{
+  A vector of length two containing the minimum and the maximum distance over which to compute the test. If \code{NULL}, or if the selected range is outside the simulated distances, all distances in the \code{Envelope} argument are used.
+  }
 }
 \details{
-  This test was introduced by Diggle(1983) and extensively developped by Loosmore and Ford (2006) for \emph{K}, and applied to \emph{M} by Marcon et al. (2012).
+  This function groups several goodness-of-fit tests: the DCLF test (Diggle 1986, Cressie 1993, Loosmore & Ford 2006, Marcon et al. 2012, Myllymäki et al. 2015), the integrated deviation test (Diggle 1979), and the MAD test (Diggle 1979, Myllymäki et al. 2015).
+
+  Monte Carlo simulations assess how well observed distance-based measures of spatial structure align with expected measures under the null hypothesis, estimated here by the mean of simulations. For both observed and simulated measures, residuals — calculated as the differences between observed and expected values — are computed at each distance r.
+  These residuals are then transformed into test-specific statistics \emph{u}: \code{Test = "MAD"} uses the maximum absolute residual; \code{Test = "Integral"} and \code{Test = "DCLF"} use residuals to approximate the integrated absolute deviation and the integrated squared deviation, respectively.
+  A rank test on \emph{u} evaluates the null hypothesis that the observed point pattern originates from the same point process as the simulations.
+
+  The unequal variance of the residuals at different intervals of r influences \emph{u} statistics, and the power of Goodness of Fit tests. Myllymäki et al. (2015) proposed to scale residuals using pointwise quantiles (\code{Scaling = "Asymmetric"}, and \code{Scaling = "Quantile"}), and pointwise standard deviations (\code{Scaling = "Studentized"}).
+  Goodness of Fit tests are sensitive to the distance interval over which they are performed. It is recommended to choose the distance interval to test based on \emph{a priori} hypotheses (Wiegand & Moloney 2013, Baddeley et al. 2015).
 }
 \value{
   A p-value.
 }
 \references{
-  Diggle, P. J. (1983). \emph{Statistical analysis of spatial point patterns}. Academic Press, London. 148 p.
-  
-  Loosmore, N. B. and  Ford, E. D. (2006). Statistical inference using the G or K point pattern spatial statistics. \emph{Ecology} 87(8): 1925-1931.
-  
-  Marcon, E., F. Puech and S. Traissac (2012). Characterizing the relative spatial structure of point patterns. International \emph{Journal of Ecology} 2012(Article ID 619281): 11.  
+  Baddeley, A., Rubak, E., & Turner, R. (2015). Spatial Point Patterns: Methodology and Applications with R (1st ed.). Chapman and Hall/CRC. https://doi.org/10.1201/b19708
+
+  Cressie, N. A. C. (1993). Statistics for Spatial Data (1st ed.). Wiley. https://doi.org/10.1002/9781119115151
+
+  Diggle, P. J. (1979). On Parameter Estimation and Goodness-of-Fit Testing for Spatial Point Patterns. Biometrics, 35(1), 87. https://doi.org/10.2307/2529938
+
+  Diggle, P. J. (1986). Displaced amacrine cells in the retina of a rabbit: Analysis of a bivariate spatial point pattern. Journal of Neuroscience Methods, 18(1–2), 115–125. https://doi.org/10.1016/0165-0270(86)90115-9
+
+  Loosmore, N. B., & Ford, E. D. (2006). Statistical inference using the G or K point pattern spatial statistics. Ecology, 87(8), 1925–1931. https://doi.org/10.1890/0012-9658(2006)87[1925:SIUTGO]2.0.CO;2
+
+  Marcon, E., Puech, F., & Traissac, S. (2012). Characterizing the Relative Spatial Structure of Point Patterns. International Journal of Ecology, 2012, 1–11. https://doi.org/10.1155/2012/619281
+
+  Myllymäki, M., & Mrkvička, T. (2024). GET: Global Envelopes in R. Journal of Statistical Software, 111(3). https://doi.org/10.18637/jss.v111.i03
+
+  Myllymäki, M., Grabarnik, P., Seijo, H., & Stoyan, D. (2015). Deviation test construction and power comparison for marked spatial point patterns. Spatial Statistics, 11, 19–34. https://doi.org/10.1016/j.spasta.2014.11.004
+
+  Wiegand, T., & Moloney, K. A. (2013). Handbook of Spatial Point-Pattern Analysis in Ecology (1st ed.). Chapman and Hall/CRC. https://doi.org/10.1201/b16195
+
 }
 \note{
-  No support exists in the literature to apply the GoF test to non-cumulative functions (\emph{g}, \emph{Kd}...).
-  
   \code{\link{Ktest}} is a much better test (it does not rely on simulations) but it is limited to the \emph{K} function against complete spatial randomness (CSR) in a rectangle window.
+
+  This test is inspired by Myllymäki et al. (2015); a similar function, \code{deviation_test()}, exists in the R package \emph{GET} (Myllymäki & Mrkvička, 2024).
 }
 \seealso{
   \code{\link{Ktest}}
 }
 \examples{
+
 # Simulate a Matern (Neyman Scott) point pattern
 nclust <- function(x0, y0, radius, n) {
   return(runifdisc(n, radius, centre=c(x0, y0)))
@@ -50,6 +90,12 @@ Alpha <- .10
 Envelope <- KEnvelope(as.wmppp(X), r, NumberOfSimulations, Alpha)
 autoplot(Envelope, ./(pi*r^2) ~ r)
 
-# GoF test. Power is correct if enough simulations are run (say >1000).
-paste("p-value =", GoFtest(Envelope))
+# DCLF test using asymmetric scaling.
+# Power is correct if enough simulations are run (say >1000).
+paste("p-value =", GoFtest(Envelope, Test = "DCLF",
+                           Scaling = "Asymmetric", Range = c(0.1, 0.2)))
+
+# MAD test using asymmetric scaling.
+paste("p-value =", GoFtest(Envelope, Test = "MAD",
+                           Scaling = "Asymmetric", Range = c(0.1, 0.2)))
 }