|
| 1 | +import pandas as pd |
| 2 | +import numpy as np |
| 3 | + |
| 4 | + |
def load_individual_timeseries(name):
    """Load one JHU CSSE global COVID-19 time series in long format.

    Parameters
    ----------
    name : str
        Series name as used in the CSSE file names, e.g. ``"confirmed"``
        or ``"deaths"`` (case-insensitive; stored lower-cased in ``type``).

    Returns
    -------
    pd.DataFrame
        Indexed by ``date`` (datetime), with columns ``country``, ``state``,
        ``type`` and ``cases``. For every country that is split into states,
        an extra synthetic country ``"<name> (total)"`` holds the summed
        state-level cases.
    """
    base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series"
    url = f"{base_url}/time_series_covid19_{name}_global.csv"
    df = pd.read_csv(url, index_col=["Country/Region", "Province/State", "Lat", "Long"])
    df["type"] = name.lower()
    df.columns.name = "date"

    # Wide -> long: one row per (date, country, state, type); Lat/Long dropped.
    df = (
        df.set_index("type", append=True)
        .reset_index(["Lat", "Long"], drop=True)
        .stack()
        .reset_index()
        .set_index("date")
    )
    df.index = pd.to_datetime(df.index)
    df.columns = ["country", "state", "type", "cases"]

    # Move Hong Kong from state level to country level.
    is_hk = df.state == "Hong Kong"
    df.loc[is_hk, "country"] = "Hong Kong"
    df.loc[is_hk, "state"] = np.nan

    # Aggregate large countries that are split by states into "<country> (total)".
    # BUG FIX: select [["cases"]] before summing. Without it, the leftover
    # string "state" column is included in the aggregation; on pandas >= 2.0
    # (numeric_only no longer defaults to True) .sum() concatenates the state
    # names into a garbage column instead of dropping them.
    totals = (
        df.loc[~df.state.isna()]
        .groupby(["country", "date", "type"])[["cases"]]
        .sum()
        .rename(index=lambda x: x + " (total)", level=0)
        .reset_index(level=["country", "type"])
    )
    return pd.concat([df, totals])
| 40 | + |
| 41 | + |
def _days_since_100(confirmed):
    """Return day offsets so that day 0 is the first day with >= 100 cases.

    Assumes ``confirmed`` is ordered chronologically within its group (the
    source data is) — the result is a contiguous integer range from
    ``-(#days below 100)`` to ``#days at/above 100 - 1``.
    """
    below = int((confirmed < 100).sum())
    at_or_above = int((confirmed >= 100).sum())
    return np.arange(-below, at_or_above)


def load_data(drop_states=False, p_crit=0.05, filter_n_days_100=None):
    """Load confirmed cases and deaths, annotated with days-since-100-cases.

    Parameters
    ----------
    drop_states : bool
        If True, keep only country-level rows (``state`` is NaN).
    p_crit : float
        Assumed fraction of confirmed cases that become critical; used for
        the ``critical_estimate`` column.
    filter_n_days_100 : int or None
        If given, keep only countries that have at least this many days of
        data after crossing 100 confirmed cases.

    Returns
    -------
    pd.DataFrame
        Indexed by date with columns ``country``, ``state``, ``type``,
        ``confirmed``, ``critical_estimate``, ``days_since_100``, ``deaths``.
    """
    df = load_individual_timeseries("confirmed")
    df = df.rename(columns={"cases": "confirmed"})
    if drop_states:
        # Drop states for simplicity
        df = df.loc[df.state.isnull()]

    # Estimated critical cases
    df = df.assign(critical_estimate=df.confirmed * p_crit)

    # Compute days relative to when 100 confirmed cases was crossed,
    # per country (or per state where a country is split into states).
    df.loc[:, "days_since_100"] = np.nan
    for country in df.country.unique():
        in_country = df.country == country
        if df.loc[in_country, "state"].isnull().all():
            df.loc[in_country, "days_since_100"] = _days_since_100(
                df.loc[in_country, "confirmed"]
            )
        else:
            for state in df.loc[in_country, "state"].unique():
                # BUG FIX: `df.state == state` is always False when `state`
                # is NaN (NaN != NaN), so country-level rows of countries
                # that also carry state rows (e.g. mainland France) never
                # got a days_since_100 value. Match NaN explicitly.
                if pd.isna(state):
                    sel = in_country & df.state.isna()
                else:
                    sel = in_country & (df.state == state)
                df.loc[sel, "days_since_100"] = _days_since_100(
                    df.loc[sel, "confirmed"]
                )

    # Add recovered cases
    # df_recovered = load_individual_timeseries('Recovered')
    # df_r = df_recovered.set_index(['country', 'state'], append=True)[['cases']]
    # df_r.columns = ['recovered']

    # Add deaths, aligned on (date, country, state).
    df_deaths = load_individual_timeseries("deaths")
    df_d = df_deaths.set_index(["country", "state"], append=True)[["cases"]]
    df_d.columns = ["deaths"]

    df = (
        df.set_index(["country", "state"], append=True)
        # .join(df_r)
        .join(df_d)
        .reset_index(["country", "state"])
    )

    if filter_n_days_100 is not None:
        # Select countries for which we have at least some information
        countries = pd.Series(df.loc[df.days_since_100 >= filter_n_days_100].country.unique())
        df = df.loc[lambda x: x.country.isin(countries)]

    return df
0 commit comments