Merge pull request #169 from mrc-ide/sero_fitting

OJWatson · web-flow · commit f16e8715872b · 2021-08-09T17:02:35.000+01:00
sero_df checking and handling correctly
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: squire
 Type: Package
 Title: SEIR transmission model of COVID-19
-Version: 0.6.7
+Version: 0.6.8
 Authors@R: c(
   person("OJ", "Watson", email = "o.watson15@imperial.ac.uk", role = c("aut", "cre")),
   person("Patrick", "Walker", email = "patrick.walker06@imperial.ac.uk", role = c("aut")),
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# squire 0.6.8
+
+* `run_deterministic_comparison` patches for sero fitting and argument checks
+
 # squire 0.6.7
 
 * `pmcmc` can now be used to fit to serology data (deterministic model only) 
diff --git a/R/particle.R b/R/particle.R
@@ -361,7 +361,7 @@ compare_output <- function(model, pars_obs, data, type="explicit_SEEIR_model") {
       log_weights <- log_weights +
         ll_nbinom(data$deaths[t], model_deaths, phi_death, k_death, exp_noise)
 
-      }
+    }
 
     # We are not going to be bringing cases in so comment this out
 
@@ -484,9 +484,9 @@ intervention_dates_for_odin <- function(dates,
     dates <- dates[include]
     change <- change[include]
 
-  # if start date is in the middle of our dates but not incldued
-  # then remove all earlier dates and change the last date before the
-  # start date to the start date
+    # if start date is in the middle of our dates but not included
+    # then remove all earlier dates and change the last date before the
+    # start date to the start date
   } else if (any(start_date >= dates)) {
 
     # which are before the start date
@@ -501,8 +501,8 @@ intervention_dates_for_odin <- function(dates,
 
     dates[1] <- as.Date(start_date)
 
-  # if all the dates are after the start date then add the start date
-  # and we assume the first change value is starting_change
+    # if all the dates are after the start date then add the start date
+    # and we assume the first change value is starting_change
   } else {
 
     extra_start <- seq.Date(start_date, dates[1]-1, 1)
@@ -716,11 +716,14 @@ run_deterministic_comparison <- function(data,
 
   # calculate ll for the seroprevalence
   lls <- 0
-  if("sero_df" %in% obs_params && "sero_det" %in% obs_params) {
+  if("sero_df" %in% names(obs_params) && "sero_det" %in% names(obs_params)) {
 
     sero_df <- obs_params$sero_df
     sero_det <- obs_params$sero_det
 
+    # put some checks here that sero_df is correctly formatted
+    check_sero_df(sero_df)
+
     # were there actually seroprevalence data points to compare against
     if(nrow(sero_df) > 0) {
 
@@ -736,22 +739,22 @@ run_deterministic_comparison <- function(data,
 
       }
 
-    # get symptom incidence
-    symptoms <- rowSums(out[,index$E2]) * model_params$gamma_E
+      # get symptom incidence
+      symptoms <- rowSums(out[,index$E2]) * model_params$gamma_E
 
-    # dates of incidence, pop size and dates of sero surveys
-    dates <- data$date[[1]] + seq_len(nrow(out)) - 1L
-    N <- sum(model_params$population)
-    sero_dates <- list(sero_df$date_end, sero_df$date_start, sero_df$date_start + as.integer((sero_df$date_end - sero_df$date_start)/2))
-    unq_sero_dates <- unique(c(sero_df$date_end, sero_df$date_start, sero_df$date_start + as.integer((sero_df$date_end - sero_df$date_start)/2)))
-    det <- obs_params$sero_det
+      # dates of incidence, pop size and dates of sero surveys
+      dates <- data$date[[1]] + seq_len(nrow(out)) - 1L
+      N <- sum(model_params$population)
+      sero_dates <- list(sero_df$date_end, sero_df$date_start, sero_df$date_start + as.integer((sero_df$date_end - sero_df$date_start)/2))
+      unq_sero_dates <- unique(c(sero_df$date_end, sero_df$date_start, sero_df$date_start + as.integer((sero_df$date_end - sero_df$date_start)/2)))
+      det <- obs_params$sero_det
 
-    # estimate model seroprev
-    sero_model <- vapply(unq_sero_dates, sero_at_date, numeric(1), symptoms, det, dates, N)
-    sero_model_mat <- do.call(cbind,lapply(sero_dates, function(x) {sero_model[match(x, unq_sero_dates)]}))
+      # estimate model seroprev
+      sero_model <- vapply(unq_sero_dates, sero_at_date, numeric(1), symptoms, det, dates, N)
+      sero_model_mat <- do.call(cbind,lapply(sero_dates, function(x) {sero_model[match(x, unq_sero_dates)]}))
 
-    # likelihood of model obvs
-    lls <- rowMeans(dbinom(sero_df$sero_pos, sero_df$samples, sero_model_mat, log = TRUE))
+      # likelihood of model obvs
+      lls <- rowMeans(dbinom(sero_df$sero_pos, sero_df$samples, sero_model_mat, log = TRUE))
 
     }
 
@@ -784,3 +787,15 @@ run_deterministic_comparison <- function(data,
 
   ret
 }
+
+
+#' @noRd
+check_sero_df <- function(sero_df) {
+  # Validate the sero_df columns read by the seroprevalence likelihood; the first failing check errors
+  assert_date(sero_df$date_start) # errors unless a date or ISO-formatted string
+  assert_date(sero_df$date_end) # same requirement for the survey end date
+  assert_pos_int(sero_df$sero_pos) # errors unless integer valued
+  assert_pos_int(sero_df$samples) # errors unless integer valued
+  assert_le(sero_df$sero_pos, sero_df$samples) # errors when sero_pos is greater than samples
+  # NOTE(review): the assert_* helpers are defined outside this diff - confirm their exact semantics
+}
diff --git a/README.Rmd b/README.Rmd
@@ -144,7 +144,7 @@ This plot will plot each of the compartments of the model output. We can also pl
 plot(r, var_select = c("E", "IMild"))
 ```
 
-Or, you can specify one of `deaths`, `infections`, `hospital_occupancy`, `ICU_occupancy`, `hospital_demand` or `ICU_demand`, and plot these summary metrics that represent the combintion of a number of different compartment e.g:
+Or, you can specify one of `deaths`, `infections`, `hospital_occupancy`, `ICU_occupancy`, `hospital_demand` or `ICU_demand`, and plot these summary metrics that represent the combination of a number of different compartments, e.g.:
 
 ```{r subset variables plot2}
 plot(r, var_select = "deaths")
@@ -178,7 +178,7 @@ length of simulation and the timestep. For a full list of model inputs, please
 see the function [documentation](https://mrc-ide.github.io/squire/reference/run_explicit_SEEIR_model.html)
 
 For example, changing the initial R0 (default = 3), number of replicates (
-default = 10), simualtion length (default = 365 days) and time step (default = 
+default = 10), simulation length (default = 365 days) and time step (default = 
 0.5 days), as well as setting the population and contact matrix manually:
 
 ```{r set params}
@@ -201,7 +201,7 @@ plot(r)
 ```
 
 We can also change the R0 and contact matrix at set time points, to reflect 
-changing behaviour resulting from interventions. For example to set a 80%
+changing behaviour resulting from interventions. For example to set an 80%
 reduction in the contact matrix after 100 days :
 
 ```{r set contact matrix decrease}
@@ -356,7 +356,7 @@ three elements in `out` being the simulation outputs, the model and model parame
 plot(out, "deaths", date_0 = max(df$date), x_var = "date")
 ```
 
-With default parameters, `calibrate` will simulate up the maximum date in the data 
+With default parameters, `calibrate` will simulate up to the maximum date in the data 
 provided. The fit to this data can be shown using the plotting function and specifying `particle_fit`
 to be `TRUE`
 
@@ -380,7 +380,7 @@ plot(out$scan_results, what = "probability")
 
 The reason for the poor fits to the data shown earlier is because Algeria has implemented
 interventions prior to today. These can also be incorporated into `calibrate`. 
-For example, we can grab the assumed changes to transmission fased on government intervention
+For example, we can grab the assumed changes to transmission based on government intervention
 for Algeria.
 
 ```{r dza interventions}
@@ -420,7 +420,7 @@ plot(out, particle_fit = TRUE)
 That is a much better fit. 
 
 Any parameter that you could provide to `run_explicit_SEEIR_model` can be passed to `calibrate`. This
-includes time varying arguments such as `contact_matrix_set`, `ICU_bed_capacity` and `hosp_bed_capacity`. To incorporate these into model fitting correctly, the date at which these change must be provided (similarly to how `date_R0_change` was provided above) using `date_ICU_bed_capacity_change`, `date_ICU_bed_capacity_change` and `date_hosp_bed_capacity_change` respecitvely. In addition, the user must provide a baseline value for these, i.e. the contact matrix and bed capacity at the beginning of the epidemic:
+includes time varying arguments such as `contact_matrix_set`, `ICU_bed_capacity` and `hosp_bed_capacity`. To incorporate these into model fitting correctly, the date at which these change must be provided (similarly to how `date_R0_change` was provided above) using `date_contact_matrix_set_change`, `date_ICU_bed_capacity_change` and `date_hosp_bed_capacity_change` respectively. In addition, the user must provide a baseline value for these, i.e. the contact matrix and bed capacity at the beginning of the epidemic:
 
 ```{r particle with ints of all kinds, warning=FALSE, message=FALSE}
 out <- calibrate(
@@ -510,7 +510,7 @@ ggproj + ggplot2::geom_hline(yintercept = out$parameters$ICU_bed_capacity)
 
 We can see above that the intervention introduced is nearly sufficient to prevent ICU demand (solid line red) from exceeding the supply, whereas in the unmitigated strategy this did occur.
 
-We can also model changing interventions by changing the contact matrix over time as well as the availability of ICU and hospital beds. E.g. decreasing contacts by 75% in a week before relaxing it to 80% in 30 days time, while increasing hospital and ICU beds by 20% in 30 days time. (N.B. We can turn off the automatic scenario parameter labelling with `add_parms_to_scenarios = FALSE`):
+We can also model changing interventions by changing the contact matrix over time as well as the availability of ICU and hospital beds. E.g. decreasing contacts by 75% in a week before relaxing it to 80% in 30 days time, while increasing hospital and ICU beds by 200% in 30 days time. (N.B. We can turn off the automatic scenario parameter labelling with `add_parms_to_scenarios = FALSE`):
 
 ```{r projection relative beds}
 # create our projections
diff --git a/README.md b/README.md
@@ -187,7 +187,7 @@ plot(r, var_select = c("E", "IMild"))
 <img src="man/figures/README-subset variables plot-1.png" width="100%" />
 Or, you can specify one of `deaths`, `infections`, `hospital_occupancy`,
 `ICU_occupancy`, `hospital_demand` or `ICU_demand`, and plot these
-summary metrics that represent the combintion of a number of different
+summary metrics that represent the combination of a number of different
 compartment
 e.g:
 
@@ -251,7 +251,7 @@ full list of model inputs, please see the function
 [documentation](https://mrc-ide.github.io/squire/reference/run_explicit_SEEIR_model.html)
 
 For example, changing the initial R0 (default = 3), number of replicates
-( default = 10), simualtion length (default = 365 days) and time step
+( default = 10), simulation length (default = 365 days) and time step
 (default = 0.5 days), as well as setting the population and contact
 matrix manually:
 
@@ -282,7 +282,7 @@ plot(r)
 
 We can also change the R0 and contact matrix at set time points, to
 reflect changing behaviour resulting from interventions. For example to
-set a 80% reduction in the contact matrix after 100 days :
+set an 80% reduction in the contact matrix after 100 days :
 
 ``` r
 
@@ -306,7 +306,7 @@ plot(r, var_select = "infections")
 
 where `n_E2_I` is the daily number of new infections.
 
-To show an 80% reduction after 50 days but only maintained for 30 days :
+To show an 80% reduction after 80 days but only maintained for 40 days :
 
 ``` r
 
@@ -329,7 +329,7 @@ plot(r, var_select = "infections")
 
 <img src="man/figures/README-set contact matrix decrease and relax-1.png" width="100%" />
 
-Alternatively, we could set a changing R0, which falls below 1 after 50
+Alternatively, we could set a changing R0, which falls below 1 after 80
 days:
 
 ``` r
@@ -501,7 +501,7 @@ plot(out, "deaths", date_0 = max(df$date), x_var = "date")
 
 <img src="man/figures/README-plot particle deaths-1.png" width="100%" />
 
-With default parameters, `calibrate` will simulate up the maximum date
+With default parameters, `calibrate` will simulate up to the maximum date
 in the data provided. The fit to this data can be shown using the
 plotting function and specifying `particle_fit` to be `TRUE`
 
@@ -533,7 +533,7 @@ plot(out$scan_results, what = "probability")
 The reason for the poor fits to the data shown earlier is because
 Algeria has implemented interventions prior to today. These can also be
 incorporated into `calibrate`. For example, we can grab the assumed
-changes to transmission fased on government intervention for
+changes to transmission based on government intervention for
 Algeria.
 
 ``` r
@@ -590,7 +590,7 @@ incorporate these into model fitting correctly, the date at which these
 change must be provided (similarly to how `date_R0_change` was provided
 above) using `date_ICU_bed_capacity_change`,
 `date_ICU_bed_capacity_change` and `date_hosp_bed_capacity_change`
-respecitvely. In addition, the user must provide a baseline value for
+respectively. In addition, the user must provide a baseline value for
 these, i.e. the contact matrix and bed capacity at the beginning of the
 epidemic:
 
@@ -725,7 +725,7 @@ whereas in the unmitigated strategy this did occur.
 We can also model changing interventions by changing the contact matrix
 over time as well as the availability of ICU and hospital beds. E.g.
 decreasing contacts by 75% in a week before relaxing it to 80% in 30
-days time, while increasing hospital and ICU beds by 20% in 30 days
+days time, while increasing hospital and ICU beds by 200% in 30 days
 time. (N.B. We can turn off the automatic scenario parameter labelling
 with `add_parms_to_scenarios = FALSE`):
 
diff --git a/tests/testthat/test-pmcmc.R b/tests/testthat/test-pmcmc.R
@@ -1703,6 +1703,8 @@ test_that("sero fitting works", {
   pars_obs$sero_df <- sero_df
   pars_obs$sero_det <- sero_det
 
+  # following checks to see that it is being correctly used to get better likelihoods
+  # given contribution from the sero ll
   Sys.setenv("SQUIRE_PARALLEL_DEBUG"=TRUE)
   out <- pmcmc(data = data,
                n_mcmc = 5,
@@ -1753,4 +1755,73 @@ test_that("sero fitting works", {
 
   expect_s3_class(plot(out, what = "deaths", particle_fit = TRUE), "gg")
 
+
+  # and checks that the sero_df date is correctly formatted
+  pars_obs$sero_df$date_end <- factor(pars_obs$sero_df$date_end)
+  expect_error(pmcmc(data = data,
+                n_mcmc = 5,
+                log_likelihood = NULL,
+                log_prior = NULL,
+                n_particles = 2,
+                steps_per_day = steps_per_day,
+                output_proposals = FALSE,
+                n_chains = 1,
+                replicates = 20,
+                burnin = 5,
+                squire_model = squire_model,
+                pars_init = pars_init,
+                pars_min = pars_min,
+                pars_max = pars_max,
+                pars_discrete = pars_discrete,
+                pars_obs = pars_obs,
+                proposal_kernel = proposal_kernel,
+                R0_change = R0_change,
+                date_R0_change = date_R0_change,
+                country = country),
+               "date_end must be a date or ISO-formatted string")
+
+
 })
+
+
+#------------------------------------------------
+test_that("sero df checking works", {
+# check_sero_df should raise an informative error for each malformed column
+sero_df <- data.frame("samples" = 1000, "sero_pos" = 10,
+                      "date_start" = as.Date("2020-04-15"),
+                      "date_end" = as.Date("2020-04-19"))
+# sero_df above is the well-formed baseline; each case below resets to it and breaks one column
+sero_df_fail <- sero_df
+
+# date_start supplied as a factor rather than a Date
+sero_df_fail <- sero_df
+sero_df_fail$date_start <- factor(sero_df_fail$date_start)
+expect_error(check_sero_df(sero_df_fail),
+             "date_start must be a date or ISO-formatted string")
+
+# date_end supplied as a factor rather than a Date
+sero_df_fail <- sero_df
+sero_df_fail$date_end <- factor(sero_df_fail$date_end)
+expect_error(check_sero_df(sero_df_fail),
+             "date_end must be a date or ISO-formatted string")
+
+# samples must be integer valued
+sero_df_fail <- sero_df
+sero_df_fail$samples <- 101010.123
+expect_error(check_sero_df(sero_df_fail),
+             "samples must be integer valued")
+
+# sero_pos must be integer valued
+sero_df_fail <- sero_df
+sero_df_fail$sero_pos <- 10.123
+expect_error(check_sero_df(sero_df_fail),
+             "sero_pos must be integer valued")
+
+# sero_pos (10000) larger than samples (1000) must be rejected
+sero_df_fail <- sero_df
+sero_df_fail$sero_pos <- 10000
+expect_error(check_sero_df(sero_df_fail),
+             "sero_pos must be less than")
+
+})
+