-
Notifications
You must be signed in to change notification settings - Fork 3
Formatting output from multiple runs
RobertManningSmith edited this page Jun 14, 2024
·
7 revisions
For the ICCS paper I used a few helper functions to format the model output: averaging the metrics across runs, scaling them up to the population size, and so on.
def average_output_from_runs_v1(filepath, sample_size):
    """Average every output column over all run files in a folder.

    Each file in ``filepath`` is read as a tab-separated table, all tables
    are stacked, and the mean of every column is taken per ``time`` value.

    Args:
        filepath: Folder containing one tab-separated output file per run.
        sample_size: The averaged outputs are multiplied by
            ``100 / sample_size`` (presumably the simulated share of the
            population in percent -- confirm against the model config).

    Returns:
        DataFrame indexed by ``time`` holding the scaled means plus a
        ``new_cases`` column (asymptomatic + symptomatic new cases).
    """
    runs = []
    # Iterate over each file in the filepath
    for file in os.listdir(filepath):
        # read in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # 'Unnamed: 10' is the name pandas gives a header-less 11th column
        # (presumably caused by a trailing tab on every line); drop it if present
        runs.append(data.drop(columns='Unnamed: 10', errors='ignore'))
    # concatenate once instead of growing a frame with the private _append
    storage_df = pd.concat(runs)
    # Calculate the average values of each output by each day
    storage_df = storage_df.groupby('time').mean()
    # Scale the outputs to the population size
    storage_df *= 100 / sample_size
    # Calculate the total number of new cases
    storage_df['new_cases'] = (
        storage_df['metric_new_cases_asympt'] + storage_df['metric_new_cases_sympt']
    )
    # return the averaged outputs
    return storage_df


def average_output_from_runs_v3(filepath, sample_size):
    """Average population-level outputs over all run files in a folder.

    Within each run the rows sharing a ``time`` value (one per district) are
    summed first, so every run contributes one population-level row per time
    step; those rows are then scaled and averaged across runs.

    Args:
        filepath: Folder containing one tab-separated output file per run.
        sample_size: Outputs are multiplied by ``100 / sample_size``
            (presumably the simulated share of the population in percent --
            confirm against the model config).

    Returns:
        DataFrame indexed by ``time`` with the scaled mean of every numeric
        column and of ``new_cases``.
    """
    runs = []
    # Iterate over each file in the filepath
    for file in os.listdir(filepath):
        # read in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column if it is present
        data = data.drop(columns='Unnamed: 10', errors='ignore')
        # group the estimates from each district by time to get a population
        # level overview; numeric_only keeps text columns (e.g. district ids)
        # out of the sum so the scaling below cannot fail on them
        data = data.groupby('time').sum(numeric_only=True)
        # calculate the number of new cases
        data['new_cases'] = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        runs.append(data)
    storage_df = pd.concat(runs)
    # scale estimates to population levels
    storage_df *= 100 / sample_size
    # average the per-run estimates by time ('time' is the index level here)
    return storage_df.groupby('time').mean()


# Calculating the standard deviation in the output from all runs in a given folder (non-spatial model)
def std_output_from_runs_v1(filepath, sample_size):
    """Standard deviation of every output column over all run files.

    Each file in ``filepath`` is read as a tab-separated table, all tables
    are stacked, and the sample standard deviation of every column is taken
    per ``time`` value.

    Args:
        filepath: Folder containing one tab-separated output file per run.
        sample_size: The results are multiplied by ``100 / sample_size``
            (presumably the simulated share of the population in percent --
            confirm against the model config).

    Returns:
        DataFrame indexed by ``time`` with the scaled standard deviations,
        including a ``new_cases`` column.
    """
    runs = []
    # Iterate over each file in the filepath
    for file in os.listdir(filepath):
        # read in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column if it is present
        runs.append(data.drop(columns='Unnamed: 10', errors='ignore'))
    storage_df = pd.concat(runs)
    # Total new cases must be formed per row BEFORE aggregating: the standard
    # deviation of a sum is not the sum of the two standard deviations.
    storage_df['new_cases'] = (
        storage_df['metric_new_cases_asympt'] + storage_df['metric_new_cases_sympt']
    )
    # Calculate the standard deviation in values of each output by each day
    storage_df = storage_df.groupby('time').std()
    # Scale the outputs to the population size
    storage_df *= 100 / sample_size
    return storage_df


def std_output_from_runs_v3(filepath, sample_size):
    """Standard deviation of population-level outputs over all run files.

    Within each run the rows sharing a ``time`` value (one per district) are
    summed first; the spread across runs is then computed per time step.

    Args:
        filepath: Folder containing one tab-separated output file per run.
        sample_size: Outputs are multiplied by ``100 / sample_size``
            (presumably the simulated share of the population in percent --
            confirm against the model config).

    Returns:
        DataFrame indexed by ``time`` with the sample standard deviation of
        every scaled numeric column and of ``new_cases``.
    """
    runs = []
    # Iterate over each file in the filepath
    for file in os.listdir(filepath):
        # read in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column if it is present
        data = data.drop(columns='Unnamed: 10', errors='ignore')
        # group the estimates from each district by time to get a population
        # level overview; numeric_only keeps text columns out of the sum
        data = data.groupby('time').sum(numeric_only=True)
        # calculate the number of new cases
        data['new_cases'] = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        runs.append(data)
    storage_df = pd.concat(runs)
    # scale estimates to population levels
    storage_df *= 100 / sample_size
    # calculate standard deviation by time ('time' is the index level here)
    return storage_df.groupby('time').std()


# Calculating the mean and standard deviation in the output for n runs in a given folder (Non-spatial model)
def sample_output_from_runs_v1(filepath, sample_size, number_of_samples):
    """Mean and standard deviation of the outputs over a random subset of runs.

    ``number_of_samples`` files are drawn without replacement from
    ``filepath`` (``random.sample`` raises ``ValueError`` when the folder
    holds fewer files), stacked, and aggregated per ``time`` value.

    Args:
        filepath: Folder containing one tab-separated output file per run.
        sample_size: Both results are multiplied by ``100 / sample_size``
            (presumably the simulated share of the population in percent --
            confirm against the model config).
        number_of_samples: How many run files to draw.

    Returns:
        Tuple ``(mean_df, std_df)`` of DataFrames indexed by ``time``.
    """
    runs = []
    # Iterate over a random selection of the files in the filepath
    for file in random.sample(os.listdir(filepath), number_of_samples):
        # read in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column if it is present
        runs.append(data.drop(columns='Unnamed: 10', errors='ignore'))
    # concatenate once instead of growing a frame with the private _append
    storage_df = pd.concat(runs)
    # Calculate the total number of new cases
    storage_df['new_cases'] = (
        storage_df['metric_new_cases_asympt'] + storage_df['metric_new_cases_sympt']
    )
    # Calculate the average and variation in values of each output by each day
    by_time = storage_df.groupby('time')
    mean_df = by_time.mean()
    std_df = by_time.std()
    # Scale the outputs to the population size
    scale = 100 / sample_size
    mean_df *= scale
    std_df *= scale
    # return the outputs
    return mean_df, std_df


# Calculating the mean and standard deviation in the output for n runs in a given folder (spatial model)
def sample_output_from_runs_v3(filepath, sample_size, number_of_samples):
    """Mean and standard deviation of population-level outputs over a random subset of runs.

    ``number_of_samples`` files are drawn without replacement from
    ``filepath``. Within each run the rows sharing a ``time`` value (one per
    district) are summed, then the per-run totals are scaled and aggregated
    across runs.

    Args:
        filepath: Folder containing one tab-separated output file per run.
        sample_size: Outputs are multiplied by ``100 / sample_size``
            (presumably the simulated share of the population in percent --
            confirm against the model config).
        number_of_samples: How many run files to draw.

    Returns:
        Tuple ``(mean_df, std_df)`` of DataFrames indexed by ``time``.
    """
    runs = []
    # Iterate over a random selection of the files in the filepath
    for file in random.sample(os.listdir(filepath), number_of_samples):
        # read in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column if it is present
        data = data.drop(columns='Unnamed: 10', errors='ignore')
        # group the estimates from each district by time to get a population
        # level overview; numeric_only keeps text columns out of the sum
        data = data.groupby('time').sum(numeric_only=True)
        # calculate the number of new cases
        data['new_cases'] = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        runs.append(data)
    storage_df = pd.concat(runs)
    # Scale estimates to population levels exactly once. The earlier version
    # scaled here and then again on the aggregated frames, which inflated
    # both results by an extra factor of 100 / sample_size.
    storage_df *= 100 / sample_size
    # calculate mean and standard deviation by time ('time' is the index level)
    by_time = storage_df.groupby('time')
    mean_df = by_time.mean()
    std_df = by_time.std()
    return mean_df, std_df


# Calculating the spatial spread of cases at a specific time from n runs in a given folder (spatial model)
def calculate_has_cases_in_space_with_samples(filepath, timemax, samples):
    """Per-district averages, up to ``timemax``, over a random subset of runs.

    For every sampled run the cumulative number of new cases per district is
    computed and each row is flagged with ``has_cases`` (1 once the district's
    cumulative count is above zero, else 0). Rows with ``time <= timemax``
    from all sampled runs are then averaged per district.

    Args:
        filepath: Folder containing one tab-separated output file per run.
        timemax: Largest ``time`` value kept before averaging.
        samples: Number of run files drawn without replacement.

    Returns:
        DataFrame indexed by ``myId`` with the mean of every column, sorted by
        ``district_number`` (the integer parsed from ``myId`` after its first
        two characters).
    """
    runs = []
    # iterate over a random selection of files
    for file in random.sample(os.listdir(filepath), samples):
        # load in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column if it is present
        data = data.drop(columns='Unnamed: 10', errors='ignore')
        # new cases in each district at each time step
        new_cases = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        # running total per district, in row order (replaces the explicit
        # loop over districts with a grouped cumulative sum)
        data['cumulative_cases'] = new_cases.groupby(data['myId']).cumsum()
        data['new_cases'] = new_cases
        # 1 from the first row at which the district has recorded a case
        data['has_cases'] = (data['cumulative_cases'] > 0).astype(int)
        runs.append(data)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    storage_df = pd.concat(runs)
    # numeric part of the district id (ids are assumed to look like 'd_12')
    storage_df['district_number'] = [int(dist[2:]) for dist in storage_df['myId'].values]
    # filter the dataframe up to a specified time
    storage_df = storage_df.loc[storage_df['time'] <= timemax]
    # average out the estimates from each run
    storage_df = storage_df.groupby('myId').mean()
    # order the districts numerically: 1, 2, 3, ..., rather than as strings
    return storage_df.sort_values('district_number')


# Calculating the spatial spread of cases at a specific time for all runs in a given folder (spatial model)
def calculate_has_cases_in_space_with_samples(filepath, timemax, samples):
    """Per-district averages, up to ``timemax``, over ALL runs in a folder.

    For every run file the cumulative number of new cases per district is
    computed and each row is flagged with ``has_cases`` (1 once the district's
    cumulative count is above zero, else 0). Rows with ``time <= timemax``
    from all runs are then averaged per district.

    NOTE(review): this function reads every file in ``filepath`` and never
    uses ``samples``. Its name is identical to the sampled variant, so
    defining both in one module makes this one shadow the other -- consider
    renaming it (e.g. dropping ``_with_samples``) once callers are updated.

    Args:
        filepath: Folder containing one tab-separated output file per run.
        timemax: Largest ``time`` value kept before averaging.
        samples: Unused; kept so the signature stays unchanged.

    Returns:
        DataFrame indexed by ``myId`` with the mean of every column, sorted by
        ``district_number`` (the integer parsed from ``myId`` after its first
        two characters).
    """
    runs = []
    # iterate over every file in the folder
    for file in os.listdir(filepath):
        # load in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column if it is present
        data = data.drop(columns='Unnamed: 10', errors='ignore')
        # new cases in each district at each time step
        new_cases = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        # running total per district, in row order (replaces the explicit
        # loop over districts with a grouped cumulative sum)
        data['cumulative_cases'] = new_cases.groupby(data['myId']).cumsum()
        data['new_cases'] = new_cases
        # 1 from the first row at which the district has recorded a case
        data['has_cases'] = (data['cumulative_cases'] > 0).astype(int)
        runs.append(data)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    storage_df = pd.concat(runs)
    # numeric part of the district id (ids are assumed to look like 'd_12')
    storage_df['district_number'] = [int(dist[2:]) for dist in storage_df['myId'].values]
    # filter the dataframe up to a specified time
    storage_df = storage_df.loc[storage_df['time'] <= timemax]
    # average out the estimates from each run
    storage_df = storage_df.groupby('myId').mean()
    # order the districts numerically: 1, 2, 3, ..., rather than as strings
    return storage_df.sort_values('district_number')


# Calculating the cumulative number of districts with covid cases from n runs in a given folder (spatial model)
def calculate_cumulative_spread_in_districts(filepath, samples):
    """Average number of districts that have recorded a case, per time step.

    ``samples`` run files are drawn without replacement from ``filepath``.
    In each run a district counts as infected from the first row at which its
    cumulative new cases exceed zero. The infected-district counts are summed
    over the sampled runs for every ``time`` value and divided by ``samples``.

    Args:
        filepath: Folder containing one tab-separated output file per run.
        samples: Number of run files to draw (also the divisor of the sums).

    Returns:
        List with one averaged count per ``time`` value, in ascending time
        order.
    """
    runs = []
    # iterate over the selected files
    for file in random.sample(os.listdir(filepath), samples):
        # load the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column if it is present
        data = data.drop(columns='Unnamed: 10', errors='ignore')
        # new cases in each district at each time step
        new_cases = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        # running total per district, in row order (replaces the explicit
        # loop over districts with a grouped cumulative sum)
        data['cumulative_cases'] = new_cases.groupby(data['myId']).cumsum()
        data['new_cases'] = new_cases
        # 1 from the first row at which the district has recorded a case
        data['has_cases'] = (data['cumulative_cases'] > 0).astype(int)
        runs.append(data)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    storage_df = pd.concat(runs)
    # number of infected districts by time, summed over the sampled runs;
    # only 'has_cases' is needed, so the other columns are not aggregated
    n_infected_districts = storage_df.groupby('time')['has_cases'].sum()
    # average this out by the number of samples
    cumulative_n_infected_dist = list(np.divide(n_infected_districts.values, samples))
    # return the list of cumulative districts with Covid cases
    return cumulative_n_infected_dist