Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
135 changes: 135 additions & 0 deletions src/policies/task_compare_vaccination_rates_by_age_group.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytask

from src.config import BLD
from src.config import SRC
from src.plotting.plotting import BLUE
from src.plotting.plotting import PLOT_SIZE
from src.plotting.plotting import style_plot
from src.policies.find_people_to_vaccinate import find_people_to_vaccinate


@pytask.mark.depends_on(
{
"states": BLD / "data" / "initial_states.parquet",
"vaccination_shares": BLD
/ "data"
/ "vaccinations"
/ "vaccination_shares_extended.pkl",
"params": BLD / "params.pkl",
"empirical_by_age": BLD
/ "data"
/ "vaccinations"
/ "vaccinations_by_age_group.pkl",
"vaccination_func": SRC / "policies" / "find_people_to_vaccinate.py",
}
)
@pytask.mark.produces(
BLD / "figures" / "vaccinations" / "simulated_vs_empirical_rates_by_age_group.pdf"
)
def task_compare_vaccination_rates_by_age_group(depends_on, produces):
vaccination_shares = pd.read_pickle(depends_on["vaccination_shares"])
empirical_by_age = pd.read_pickle(depends_on["empirical_by_age"]).T
params = pd.read_pickle(depends_on["params"])
states = pd.read_parquet(depends_on["states"])
states["ever_vaccinated"] = False

start = pd.Timestamp("2020-12-15")
end = vaccination_shares.index.max()
dates = pd.date_range(start, end)

vaccination_func = partial(
find_people_to_vaccinate,
vaccination_shares=vaccination_shares,
init_start=start,
params=params,
)

modeled_vaccination_shares = _simulate_vaccinations_alone(
states, dates, vaccination_func
)
fig = _plot_simulated_vs_empirical_vaccination_shares(
modeled_vaccination_shares=modeled_vaccination_shares,
empirical_by_age=empirical_by_age,
overall_vaccination_shares=vaccination_shares,
)
fig.savefig(produces)


def _simulate_vaccinations_alone(states, dates, vaccination_func):
"""Simulate just the vaccinations over a time frame.

Args:
states (pandas.DataFrame): states DataFrame, must have "age" and
"ever_vaccinated".
dates (iterable): list of dates.
vaccination_func (callable): vaccination function that expects the
arguments "states", "receives_vaccine" and "seed"

Returns:
pandas.DataFrame: columns are "overall", "12-17", "18-59", ">=60".
Each row is a date. Each cell contains the share of the respective
age group on the particular date.

"""
states = states.copy(deep=True)

out = pd.DataFrame(
np.nan, columns=["overall", "12-17", "18-59", ">=60"], index=dates
)

for seed, date in enumerate(dates):
states["date"] = date
to_vaccinate = vaccination_func(
receives_vaccine=pd.Series(False, index=states.index),
states=states,
seed=seed,
)
states.loc[to_vaccinate, "ever_vaccinated"] = True
out.loc[date, "overall"] = states["ever_vaccinated"].mean()
out.loc[date, "12-17"] = states.query("12 <= age <= 17")[
"ever_vaccinated"
].mean()
out.loc[date, "18-59"] = states.query("18 <= age <= 59")[
"ever_vaccinated"
].mean()
out.loc[date, ">=60"] = states.query("60 <= age")["ever_vaccinated"].mean()
return out


def _plot_simulated_vs_empirical_vaccination_shares(
modeled_vaccination_shares, empirical_by_age, overall_vaccination_shares
):
fig, axes = plt.subplots(
nrows=2, ncols=2, figsize=(2 * PLOT_SIZE[0], 2 * PLOT_SIZE[1])
)
for col, ax in zip(empirical_by_age, axes.flatten()):
modeled_vaccination_shares[col].plot(
label="simulated",
alpha=0.6,
ax=ax,
lw=2.5,
color=BLUE,
)
if col != "overall":
empirical_by_age[col].plot(
label="empirical", alpha=0.6, ax=ax, lw=2.5, color="k"
)
else:
overall_vaccination_shares["2021-01-01":].cumsum().plot(
label="empirical overall",
alpha=0.6,
ax=ax,
lw=2.5,
color="k",
)

ax.set_title(col)
ax.legend()
ax.set_ylim(-0.1, 1.1)
fig, ax = style_plot(fig, ax)
return fig
113 changes: 92 additions & 21 deletions src/prepare_data/task_prepare_vaccination_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytask
import seaborn as sns
Expand All @@ -10,6 +11,7 @@
from src.config import PLOT_SIZE
from src.config import PLOT_START_DATE
from src.config import POPULATION_GERMANY
from src.config import SRC
from src.plotting.plotting import style_plot


Expand All @@ -22,6 +24,77 @@
)


_BY_AGE_DEPENDENCIES = {
"june": SRC / "original_data" / "vaccinations_by_age_group" / "2021-06-14.xlsx",
"july": SRC / "original_data" / "vaccinations_by_age_group" / "2021-07-14.xlsx",
"aug1": SRC / "original_data" / "vaccinations_by_age_group" / "2021-08-01.xlsx",
"aug2": SRC / "original_data" / "vaccinations_by_age_group" / "2021-08-12.xlsx",
"aug3": SRC / "original_data" / "vaccinations_by_age_group" / "2021-08-23.xlsx",
}


@pytask.mark.depends_on(_BY_AGE_DEPENDENCIES)
@pytask.mark.produces(
{
"data": BLD / "data" / "vaccinations" / "vaccinations_by_age_group.pkl",
"fig": BLD / "figures" / "data" / "vaccinations_by_age_group.pdf",
}
)
def task_create_vaccination_rates_by_age_group(depends_on, produces):
june = pd.read_excel(
depends_on["june"], sheet_name="Impfquote_bis_einschl_14.06.21", header=[0, 2]
)
# no data at the federal level yet, so take the unweighted mean over the
# states for which data is available
june = june["Impfquote mindestens einmal geimpft *"].replace("-", np.nan).mean()
june = june.rename(index={"<18 Jahre": "12-17 Jahre"})
june.name = pd.Timestamp("2021-06-14")

july = pd.read_excel(
depends_on["july"], sheet_name="Impfquote_bis_einschl_14.07.21", header=[0, 2]
)
july = july.loc[17, "Impfquote mindestens einmal geimpft *"]
july = july.rename(index={"<18 Jahre": "12-17 Jahre"})
july.name = pd.Timestamp("2021-07-14")

aug1 = pd.read_excel(
depends_on["aug1"], sheet_name="Impfquote_bis_einschl_01.08.21", header=[0, 2]
)
aug1 = aug1.loc[17, "Impfquote mindestens einmal geimpft "]
aug1.index = [x.replace("*", "") for x in aug1.index]
aug1.name = pd.Timestamp("2021-08-01")

aug2 = pd.read_excel(
depends_on["aug2"], sheet_name="Impfquote_bis_einschl_12.08.21", header=[0, 2]
)
aug2 = aug2.loc[17, "Impfquote mindestens einmal geimpft "]
aug2.index = [x.replace("*", "") for x in aug2.index]
aug2.name = pd.Timestamp("2021-08-12")

aug3 = pd.read_excel(
depends_on["aug3"], sheet_name="Impfquote_bis_einschl_23.08.21", header=[0, 2]
)
aug3 = aug3.loc[17, "Impfquote mindestens einmal geimpft "]
aug3.index = [x.replace("*", "") for x in aug3.index]
aug3.name = pd.Timestamp("2021-08-23")

vacc_shares_by_age = pd.concat([june, july, aug1, aug2, aug3], axis=1) / 100
vacc_shares_by_age = vacc_shares_by_age.rename(
index={
"Gesamt": "overall",
"12-17 Jahre": "12-17",
"18-59 Jahre": "18-59",
"60+ Jahre": ">=60",
}
)
vacc_shares_by_age.to_pickle(produces["data"])

fig, ax = plt.subplots(figsize=PLOT_SIZE)
vacc_shares_by_age.T.plot(ax=ax)
fig, ax = style_plot(fig, ax)
fig.savefig(produces["fig"])


@pytask.mark.depends_on(
{
"data": BLD / "data" / "raw_time_series" / "vaccinations.xlsx",
Expand Down Expand Up @@ -61,38 +134,37 @@ def task_prepare_vaccination_data(depends_on, produces):
plt.close()

vaccination_shares = df["share_with_first_dose"].diff().dropna()
vaccination_shares.to_pickle(produces["vaccination_shares_raw"])

# extend data to 2020.
backward_dates = pd.date_range("2020-01-01", vaccination_shares.index.max())
vaccination_shares = vaccination_shares.reindex(backward_dates)
vaccination_shares = vaccination_shares.fillna(0)

# the first individuals to be vaccinated were nursing homes which are not
# in our synthetic data so we exclude the first 1% of vaccinations to
# be going to them.
vaccination_shares[vaccination_shares.cumsum() <= 0.01] = 0

vaccination_shares.to_pickle(produces["vaccination_shares_raw"])

# family physicians started vaccinating on April 6th (Tue after Easter)
# we assume that the number of vaccinations is constant to the weekday's
# mean when extrapolating into the future.
start_physicians = pd.Timestamp("2021-04-06")
after_start = vaccination_shares.loc[start_physicians:]

dayname_to_mean = after_start.groupby(after_start.index.day_name()).mean()
# for reproduction of vaccination scenarios save the weekday means
easter_until_july = vaccination_shares.loc[start_physicians:"2021-07-06"]
dayname_to_mean = easter_until_july.groupby(
easter_until_july.index.day_name()
).mean()
with open(produces["mean_vacc_share_per_day"], "w") as f:
yaml.dump(data=dayname_to_mean.to_dict(), stream=f)

start_date = vaccination_shares.index.max() + pd.Timedelta(days=1)
end_date = start_date + pd.Timedelta(weeks=12)
future_dates = pd.date_range(start_date, end_date)
future_day_names = future_dates.day_name()
future_values = future_day_names.to_series().replace(dayname_to_mean)
extension = pd.Series(future_values.values, index=future_dates)
# extrapolate into the future
last_month_avg = vaccination_shares[-28:].mean()

end_date = vaccination_shares.index.max() + pd.Timedelta(days=56)
extended_dates = pd.date_range("2020-03-01", end_date)
extended = vaccination_shares.copy(deep=True).reindex(extended_dates)
extended[: vaccination_shares.index.min()] = 0.0
extrapolated = extended.fillna(last_month_avg)

labeled = [
("raw data", vaccination_shares),
("extension", extension),
("extrapolated", extrapolated),
]
fig, ax = _plot_labeled_series(labeled)
ax.axvline(
Expand All @@ -105,9 +177,8 @@ def task_prepare_vaccination_data(depends_on, produces):
fig.savefig(produces["fig_vaccination_shares"])
plt.close()

extended = pd.concat([vaccination_shares, extension])
_test_extended(extended)
extended.to_pickle(produces["vaccination_shares_extended"])
_test_extrapolated(extrapolated)
extrapolated.to_pickle(produces["vaccination_shares_extended"])


def _clean_vaccination_data(df):
Expand Down Expand Up @@ -165,7 +236,7 @@ def _plot_labeled_series(labeled):
return fig, ax


def _test_extended(sr):
def _test_extrapolated(sr):
assert sr.index.is_monotonic, "index is not monotonic."
assert not sr.index.duplicated().any(), "Duplicate dates in Series."
assert (sr.index == pd.date_range(start=sr.index.min(), end=sr.index.max())).all()