|
| 1 | +import pandas as pd |
| 2 | +import numpy as np |
| 3 | + |
| 4 | + |
def load_individual_timeseries(name):
    """Load one JHU CSSE global COVID-19 time series in long format.

    Parameters
    ----------
    name : str
        Series name as used in the CSSE file names, e.g. ``"confirmed"``
        or ``"deaths"`` (case-insensitive; stored lower-cased in ``type``).

    Returns
    -------
    pd.DataFrame
        Indexed by ``date`` (datetime), with columns ``country``, ``state``,
        ``type`` and ``cases``. For every country that is split into states,
        an extra synthetic country ``"<name> (total)"`` holds the summed
        state-level cases.
    """
    base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series"
    url = f"{base_url}/time_series_covid19_{name}_global.csv"
    df = pd.read_csv(url, index_col=["Country/Region", "Province/State", "Lat", "Long"])
    df["type"] = name.lower()
    df.columns.name = "date"

    # Wide -> long: one row per (date, country, state, type); Lat/Long dropped.
    df = (
        df.set_index("type", append=True)
        .reset_index(["Lat", "Long"], drop=True)
        .stack()
        .reset_index()
        .set_index("date")
    )
    df.index = pd.to_datetime(df.index)
    df.columns = ["country", "state", "type", "cases"]

    # Move Hong Kong from state level to country level.
    is_hk = df.state == "Hong Kong"
    df.loc[is_hk, "country"] = "Hong Kong"
    df.loc[is_hk, "state"] = np.nan

    # Aggregate large countries that are split by states into "<country> (total)".
    # BUG FIX: select [["cases"]] before summing. Without it, the leftover
    # string "state" column is included in the aggregation; on pandas >= 2.0
    # (numeric_only no longer defaults to True) .sum() concatenates the state
    # names into a garbage column instead of dropping them.
    totals = (
        df.loc[~df.state.isna()]
        .groupby(["country", "date", "type"])[["cases"]]
        .sum()
        .rename(index=lambda x: x + " (total)", level=0)
        .reset_index(level=["country", "type"])
    )
    return pd.concat([df, totals])
| 40 | + |
| 41 | + |
def _days_since_100(confirmed):
    """Return day offsets so that day 0 is the first day with >= 100 cases.

    Assumes ``confirmed`` is ordered chronologically within its group (the
    source data is) — the result is a contiguous integer range from
    ``-(#days below 100)`` to ``#days at/above 100 - 1``.
    """
    below = int((confirmed < 100).sum())
    at_or_above = int((confirmed >= 100).sum())
    return np.arange(-below, at_or_above)


def load_data(drop_states=False, p_crit=0.05, filter_n_days_100=None):
    """Load confirmed cases and deaths, annotated with days-since-100-cases.

    Parameters
    ----------
    drop_states : bool
        If True, keep only country-level rows (``state`` is NaN).
    p_crit : float
        Assumed fraction of confirmed cases that become critical; used for
        the ``critical_estimate`` column.
    filter_n_days_100 : int or None
        If given, keep only countries that have at least this many days of
        data after crossing 100 confirmed cases.

    Returns
    -------
    pd.DataFrame
        Indexed by date with columns ``country``, ``state``, ``type``,
        ``confirmed``, ``critical_estimate``, ``days_since_100``, ``deaths``.
    """
    df = load_individual_timeseries("confirmed")
    df = df.rename(columns={"cases": "confirmed"})
    if drop_states:
        # Drop states for simplicity
        df = df.loc[df.state.isnull()]

    # Estimated critical cases
    df = df.assign(critical_estimate=df.confirmed * p_crit)

    # Compute days relative to when 100 confirmed cases was crossed,
    # per country (or per state where a country is split into states).
    df.loc[:, "days_since_100"] = np.nan
    for country in df.country.unique():
        in_country = df.country == country
        if df.loc[in_country, "state"].isnull().all():
            df.loc[in_country, "days_since_100"] = _days_since_100(
                df.loc[in_country, "confirmed"]
            )
        else:
            for state in df.loc[in_country, "state"].unique():
                # BUG FIX: `df.state == state` is always False when `state`
                # is NaN (NaN != NaN), so country-level rows of countries
                # that also carry state rows (e.g. mainland France) never
                # got a days_since_100 value. Match NaN explicitly.
                if pd.isna(state):
                    sel = in_country & df.state.isna()
                else:
                    sel = in_country & (df.state == state)
                df.loc[sel, "days_since_100"] = _days_since_100(
                    df.loc[sel, "confirmed"]
                )

    # Add recovered cases
    # df_recovered = load_individual_timeseries('Recovered')
    # df_r = df_recovered.set_index(['country', 'state'], append=True)[['cases']]
    # df_r.columns = ['recovered']

    # Add deaths, aligned on (date, country, state).
    df_deaths = load_individual_timeseries("deaths")
    df_d = df_deaths.set_index(["country", "state"], append=True)[["cases"]]
    df_d.columns = ["deaths"]

    df = (
        df.set_index(["country", "state"], append=True)
        # .join(df_r)
        .join(df_d)
        .reset_index(["country", "state"])
    )

    if filter_n_days_100 is not None:
        # Select countries for which we have at least some information
        countries = pd.Series(df.loc[df.days_since_100 >= filter_n_days_100].country.unique())
        df = df.loc[lambda x: x.country.isin(countries)]

    return df
0 commit comments