Formatting output from multiple runs

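For the ICCS paper I used a few functions to format the model output: averaging metrics across runs, scaling them up to population sizes, and so on. The snippets below assume `import os`, `import random`, `import numpy as np` and `import pandas as pd`.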

Averaging the output from all runs in a given folder (non-spatial model)

def average_output_from_runs_v1(filepath, sample_size):
    """Average the per-time output of every run in a folder (non-spatial model).

    Args:
        filepath: Directory containing one tab-delimited output file per run.
            Every entry in the directory is read. A trailing path separator
            is optional.
        sample_size: Scaling denominator; every averaged column is multiplied
            by ``100 / sample_size`` (presumably the percentage of the
            population that was simulated -- confirm against the model
            configuration).

    Returns:
        A DataFrame indexed by ``time`` holding the scaled mean of each column
        across runs, plus a ``new_cases`` column equal to
        ``metric_new_cases_asympt + metric_new_cases_sympt``.

    Raises:
        ValueError: If the directory contains no files.
    """
    # Read each run once and concatenate at the end; growing a DataFrame
    # inside the loop copies all accumulated rows on every iteration.
    runs = []
    for file in os.listdir(filepath):
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # 'Unnamed: 10' is the empty column pandas creates when present
        # (presumably from a trailing tab on each line); skip silently if a
        # file does not have it.
        runs.append(data.drop(columns='Unnamed: 10', errors='ignore'))
    if not runs:
        raise ValueError('no output files found in ' + repr(filepath))
    storage_df = pd.concat(runs)
    # Calculate the average value of each output at each time step
    storage_df = storage_df.groupby('time').mean()
    # Scale the outputs to the population size
    storage_df *= 100 / sample_size
    # Total new cases; the mean is linear, so summing the averaged columns
    # equals averaging the per-run sums.
    storage_df['new_cases'] = (
        storage_df['metric_new_cases_asympt'] + storage_df['metric_new_cases_sympt']
    )
    return storage_df

Averaging the output from all runs in a given folder (spatial model)

def average_output_from_runs_v3(filepath, sample_size):
    """Average the per-time, country-level output of every run in a folder (spatial model).

    Each run file is first summed over its rows per ``time`` value (the file
    is expected to hold one row per district per time step), then the runs
    are averaged.

    Args:
        filepath: Directory containing one tab-delimited output file per run.
            Every entry in the directory is read. A trailing path separator
            is optional.
        sample_size: Scaling denominator; every column is multiplied by
            ``100 / sample_size`` (presumably the percentage of the
            population that was simulated -- confirm).

    Returns:
        A DataFrame indexed by ``time`` with the scaled mean across runs of
        each numeric column and of ``new_cases``
        (``metric_new_cases_asympt + metric_new_cases_sympt``).

    Raises:
        ValueError: If the directory contains no files.
    """
    runs = []
    for file in os.listdir(filepath):
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the empty column pandas creates when present
        data = data.drop(columns='Unnamed: 10', errors='ignore')
        # Collapse the districts into one population-level row per time step.
        # numeric_only keeps text columns (e.g. a district id) from being
        # string-concatenated, which would break the scaling below.
        data = data.groupby('time').sum(numeric_only=True)
        # calculate the number of new cases for this run
        data['new_cases'] = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        runs.append(data)
    if not runs:
        raise ValueError('no output files found in ' + repr(filepath))
    # Concatenate once instead of appending inside the loop
    storage_df = pd.concat(runs)
    # scale estimates to population levels
    storage_df *= 100 / sample_size
    # 'time' is the index name after the per-run groupby, so this groups on
    # the index level and averages across runs
    return storage_df.groupby('time').mean()

Calculating the standard deviation in the output from all runs in a given folder (non-spatial model)

def std_output_from_runs_v1(filepath, sample_size):
    """Standard deviation across runs of each output, per time step (non-spatial model).

    Args:
        filepath: Directory containing one tab-delimited output file per run.
            Every entry in the directory is read. A trailing path separator
            is optional.
        sample_size: Scaling denominator; every column is multiplied by
            ``100 / sample_size`` (presumably the percentage of the
            population that was simulated -- confirm).

    Returns:
        A DataFrame indexed by ``time`` holding the scaled sample standard
        deviation of each column across runs, including ``new_cases``
        (``metric_new_cases_asympt + metric_new_cases_sympt``).

    Raises:
        ValueError: If the directory contains no files.
    """
    runs = []
    for file in os.listdir(filepath):
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the empty column pandas creates when present
        runs.append(data.drop(columns='Unnamed: 10', errors='ignore'))
    if not runs:
        raise ValueError('no output files found in ' + repr(filepath))
    # Concatenate once instead of appending inside the loop
    storage_df = pd.concat(runs)
    # Total new cases must be formed per run BEFORE taking the standard
    # deviation: std(a + b) is not std(a) + std(b) unless the two series are
    # perfectly correlated.
    storage_df['new_cases'] = (
        storage_df['metric_new_cases_asympt'] + storage_df['metric_new_cases_sympt']
    )
    # Calculate the standard deviation of each output at each time step
    storage_df = storage_df.groupby('time').std()
    # Scale the outputs to the population size
    storage_df *= 100 / sample_size
    return storage_df

Calculating the standard deviation in the output from all runs in a given folder (spatial model)

def std_output_from_runs_v3(filepath, sample_size):
    """Standard deviation across runs of the country-level output, per time step (spatial model).

    Each run file is first summed over its rows per ``time`` value (the file
    is expected to hold one row per district per time step); the standard
    deviation is then taken across runs.

    Args:
        filepath: Directory containing one tab-delimited output file per run.
            Every entry in the directory is read. A trailing path separator
            is optional.
        sample_size: Scaling denominator; every column is multiplied by
            ``100 / sample_size`` (presumably the percentage of the
            population that was simulated -- confirm).

    Returns:
        A DataFrame indexed by ``time`` with the scaled sample standard
        deviation across runs of each numeric column and of ``new_cases``.

    Raises:
        ValueError: If the directory contains no files.
    """
    runs = []
    for file in os.listdir(filepath):
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the empty column pandas creates when present
        data = data.drop(columns='Unnamed: 10', errors='ignore')
        # Collapse the districts into one population-level row per time step;
        # numeric_only keeps text columns out of the sum.
        data = data.groupby('time').sum(numeric_only=True)
        # calculate the number of new cases for this run
        data['new_cases'] = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        runs.append(data)
    if not runs:
        raise ValueError('no output files found in ' + repr(filepath))
    # Concatenate once instead of appending inside the loop
    storage_df = pd.concat(runs)
    # scale estimates to population levels
    storage_df *= 100 / sample_size
    # 'time' is the index name here, so this groups on the index level
    return storage_df.groupby('time').std()

Calculating the mean and standard deviation in the output for n runs in a given folder (Non-spatial model)

def sample_output_from_runs_v1(filepath, sample_size, number_of_samples):
    """Mean and standard deviation over a random subset of runs (non-spatial model).

    Args:
        filepath: Directory containing one tab-delimited output file per run.
            A trailing path separator is optional.
        sample_size: Scaling denominator; both outputs are multiplied by
            ``100 / sample_size`` (presumably the percentage of the
            population that was simulated -- confirm).
        number_of_samples: How many files to draw, without replacement, from
            the directory.

    Returns:
        A ``(mean_df, std_df)`` tuple of DataFrames indexed by ``time``,
        holding the scaled mean and sample standard deviation of every column
        (including ``new_cases``) across the sampled runs.

    Raises:
        ValueError: If ``number_of_samples`` exceeds the number of files
            (raised by ``random.sample``) or no files were selected.
    """
    runs = []
    for file in random.sample(os.listdir(filepath), number_of_samples):
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the empty column pandas creates when present
        runs.append(data.drop(columns='Unnamed: 10', errors='ignore'))
    if not runs:
        raise ValueError('no output files selected from ' + repr(filepath))
    # Concatenate once instead of appending inside the loop
    storage_df = pd.concat(runs)
    # Total new cases per run and time step, formed before aggregating so the
    # standard deviation is that of the total
    storage_df['new_cases'] = (
        storage_df['metric_new_cases_asympt'] + storage_df['metric_new_cases_sympt']
    )
    # Aggregate across runs at each time step
    by_time = storage_df.groupby('time')
    scale = 100 / sample_size
    mean_df = by_time.mean() * scale
    std_df = by_time.std() * scale
    return mean_df, std_df

Calculating the mean and standard deviation in the output for n runs in a given folder (spatial model)

def sample_output_from_runs_v3(filepath, sample_size, number_of_samples):
    """Mean and standard deviation over a random subset of runs (spatial model).

    Each sampled run file is first summed over its rows per ``time`` value
    (the file is expected to hold one row per district per time step); the
    mean and standard deviation are then taken across the sampled runs.

    Args:
        filepath: Directory containing one tab-delimited output file per run.
            A trailing path separator is optional.
        sample_size: Scaling denominator; the outputs are multiplied by
            ``100 / sample_size`` (presumably the percentage of the
            population that was simulated -- confirm).
        number_of_samples: How many files to draw, without replacement, from
            the directory.

    Returns:
        A ``(mean_df, std_df)`` tuple of DataFrames indexed by ``time``.

    Raises:
        ValueError: If ``number_of_samples`` exceeds the number of files
            (raised by ``random.sample``) or no files were selected.
    """
    runs = []
    for file in random.sample(os.listdir(filepath), number_of_samples):
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the empty column pandas creates when present
        data = data.drop(columns='Unnamed: 10', errors='ignore')
        # Collapse the districts into one population-level row per time step;
        # numeric_only keeps text columns out of the sum.
        data = data.groupby('time').sum(numeric_only=True)
        # calculate the number of new cases for this run
        data['new_cases'] = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        runs.append(data)
    if not runs:
        raise ValueError('no output files selected from ' + repr(filepath))
    # Concatenate once instead of appending inside the loop
    storage_df = pd.concat(runs)
    # Scale to population levels exactly once. Scaling the aggregated frames
    # again afterwards would multiply the results by (100 / sample_size)
    # twice and disagree with the other averaging functions.
    storage_df *= 100 / sample_size
    # 'time' is the index name here, so this groups on the index level
    by_time = storage_df.groupby('time')
    mean_df = by_time.mean()
    std_df = by_time.std()
    return mean_df, std_df

Calculating the spatial spread of cases at a specific time from n runs in a given folder (spatial model)

def calculate_has_cases_in_space_with_samples(filepath, timemax, samples):
    """Per-district averages up to ``timemax`` over a random subset of runs (spatial model).

    For every sampled run, each district's cumulative case count and a 0/1
    ``has_cases`` flag (cumulative count above zero) are computed row by row.
    Rows with ``time <= timemax`` from all sampled runs are then averaged per
    district.

    Args:
        filepath: Directory containing one tab-delimited output file per run.
            A trailing path separator is optional.
        timemax: Latest ``time`` value (inclusive) kept before averaging.
        samples: How many files to draw, without replacement, from the
            directory.

    Returns:
        A DataFrame indexed by ``myId`` holding the mean of every column over
        the retained rows, ordered by ``district_number`` (the integer parsed
        from ``myId`` after its first two characters).

    Raises:
        ValueError: If ``samples`` exceeds the number of files (raised by
            ``random.sample``) or no files were selected.
    """
    runs = []
    for file in random.sample(os.listdir(filepath), samples):
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the empty column pandas creates when present
        data = data.drop(columns='Unnamed: 10', errors='ignore')
        new_cases = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        # Running total of new cases within each district, in file row order
        # (assumes each district's rows are in chronological order -- the
        # same assumption the row-wise cumulative sum has always made).
        data['cumulative_cases'] = new_cases.groupby(data['myId']).cumsum()
        data['new_cases'] = new_cases
        # 1 once the district has recorded any case, otherwise 0
        data['has_cases'] = (data['cumulative_cases'] > 0).astype(int)
        runs.append(data)
    if not runs:
        raise ValueError('no output files selected from ' + repr(filepath))
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    storage_df = pd.concat(runs)
    # numeric district id, parsed from the text after the two-character prefix
    storage_df['district_number'] = [int(dist[2:]) for dist in storage_df.myId.values]
    # filter the dataframe up to the specified time
    storage_df = storage_df.loc[storage_df['time'] <= timemax]
    # average out the estimates from each run
    storage_df = storage_df.groupby('myId').mean()
    # order districts numerically (1, 2, 3, ...) rather than as text
    return storage_df.sort_values('district_number')

Calculating the spatial spread of cases at a specific time for all runs in a given folder (spatial model)

def calculate_has_cases_in_space_with_samples(filepath, timemax, samples=None):
    """Per-district averages up to ``timemax`` over ALL runs in a folder (spatial model).

    For every run, each district's cumulative case count and a 0/1
    ``has_cases`` flag (cumulative count above zero) are computed row by row.
    Rows with ``time <= timemax`` from all runs are then averaged per
    district.

    NOTE(review): this all-runs variant carries the same name as the sampled
    variant; if both are defined in one module the later definition replaces
    the earlier one. Consider giving this one a distinct name.

    Args:
        filepath: Directory containing one tab-delimited output file per run.
            Every entry in the directory is read. A trailing path separator
            is optional.
        timemax: Latest ``time`` value (inclusive) kept before averaging.
        samples: Unused; accepted only so existing three-argument calls keep
            working. Every file in the directory is always read.

    Returns:
        A DataFrame indexed by ``myId`` holding the mean of every column over
        the retained rows, ordered by ``district_number`` (the integer parsed
        from ``myId`` after its first two characters).

    Raises:
        ValueError: If the directory contains no files.
    """
    runs = []
    for file in os.listdir(filepath):
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the empty column pandas creates when present
        data = data.drop(columns='Unnamed: 10', errors='ignore')
        new_cases = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        # Running total of new cases within each district, in file row order
        # (assumes each district's rows are in chronological order).
        data['cumulative_cases'] = new_cases.groupby(data['myId']).cumsum()
        data['new_cases'] = new_cases
        # 1 once the district has recorded any case, otherwise 0
        data['has_cases'] = (data['cumulative_cases'] > 0).astype(int)
        runs.append(data)
    if not runs:
        raise ValueError('no output files found in ' + repr(filepath))
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    storage_df = pd.concat(runs)
    # numeric district id, parsed from the text after the two-character prefix
    storage_df['district_number'] = [int(dist[2:]) for dist in storage_df.myId.values]
    # filter the dataframe up to the specified time
    storage_df = storage_df.loc[storage_df['time'] <= timemax]
    # average out the estimates from each run
    storage_df = storage_df.groupby('myId').mean()
    # order districts numerically (1, 2, 3, ...) rather than as text
    return storage_df.sort_values('district_number')

Calculating the cumulative number of districts with covid cases from n runs in a given folder (spatial model)

def calculate_cumulative_spread_in_districts(filepath, samples):
    """Average number of districts that have recorded cases, per time step (spatial model).

    For every sampled run, each district gets a 0/1 ``has_cases`` flag that
    is 1 from the first row at which its cumulative new cases exceed zero.
    The flags are summed per ``time`` across all sampled runs and divided by
    ``samples``.

    Args:
        filepath: Directory containing one tab-delimited output file per run.
            A trailing path separator is optional.
        samples: How many files to draw, without replacement, from the
            directory; also the divisor used for averaging.

    Returns:
        A list with one value per distinct ``time`` (ascending): the mean
        number of districts with cases across the sampled runs.

    Raises:
        ValueError: If ``samples`` exceeds the number of files (raised by
            ``random.sample``) or no files were selected.
    """
    runs = []
    for file in random.sample(os.listdir(filepath), samples):
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the empty column pandas creates when present
        data = data.drop(columns='Unnamed: 10', errors='ignore')
        new_cases = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        # Running total of new cases within each district, in file row order
        # (assumes each district's rows are in chronological order).
        data['cumulative_cases'] = new_cases.groupby(data['myId']).cumsum()
        data['new_cases'] = new_cases
        # 1 once the district has recorded any case, otherwise 0
        data['has_cases'] = (data['cumulative_cases'] > 0).astype(int)
        runs.append(data)
    if not runs:
        raise ValueError('no output files selected from ' + repr(filepath))
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    storage_df = pd.concat(runs)
    # Only the flag column is needed; summing every column would also
    # string-concatenate the district ids.
    n_infected_districts = storage_df.groupby('time')['has_cases'].sum()
    # average this out by the number of samples
    return list(np.divide(n_infected_districts.values, samples))

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Formatting output from multiple runs

Averaging the output from all runs in a given folder (non-spatial model)

Averaging the output from all runs in a given folder (spatial model)

Calculating the standard deviation in the output from all runs in a given folder (non-spatial model)

Calculating the standard deviation in the output from all runs in a given folder (spatial model)

Calculating the mean and standard deviation in the output for n runs in a given folder (Non-spatial model)

Calculating the mean and standard deviation in the output for n runs in a given folder (spatial model)

Calculating the spatial spread of cases at a specific time from n runs in a given folder (spatial model)

Calculating the spatial spread of cases at a specific time for all runs in a given folder (spatial model)

Calculating the cumulative number of districts with covid cases from n runs in a given folder (spatial model)

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Clone this wiki locally