|
| 1 | +--- |
| 2 | +jupyter: |
| 3 | + jupytext: |
| 4 | + formats: ipynb,md |
| 5 | + text_representation: |
| 6 | + extension: .md |
| 7 | + format_name: markdown |
| 8 | + format_version: '1.3' |
| 9 | + jupytext_version: 1.14.4 |
| 10 | + kernelspec: |
| 11 | + display_name: env_qolmat_dev |
| 12 | + language: python |
| 13 | + name: env_qolmat_dev |
| 14 | +--- |
| 15 | + |
| 16 | +**This notebook aims to present the Qolmat repo through an example of a multivariate time series. |
| 17 | +In Qolmat, a few data imputation methods are implemented as well as a way to evaluate their performance.** |
| 18 | + |
| 19 | + |
| 20 | +First, import some useful librairies |
| 21 | + |
| 22 | +```python |
| 23 | +import warnings |
| 24 | +# warnings.filterwarnings('error') |
| 25 | +``` |
| 26 | + |
| 27 | +```python |
| 28 | +%reload_ext autoreload |
| 29 | +%autoreload 2 |
| 30 | + |
| 31 | +import pandas as pd |
| 32 | +import numpy as np |
| 33 | +import scipy |
| 34 | +import sklearn |
| 35 | +np.random.seed(1234) |
| 36 | +import pprint |
| 37 | +from matplotlib import pyplot as plt |
| 38 | +import matplotlib.image as mpimg |
| 39 | +import matplotlib.ticker as plticker |
| 40 | + |
| 41 | +tab10 = plt.get_cmap("tab10") |
| 42 | +plt.rcParams.update({'font.size': 18}) |
| 43 | + |
| 44 | +from typing import Optional |
| 45 | + |
| 46 | +from sklearn.linear_model import LinearRegression |
| 47 | +from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor |
| 48 | + |
| 49 | + |
| 50 | +import sys |
| 51 | +from qolmat.benchmark import comparator, missing_patterns |
| 52 | +from qolmat.benchmark.metrics import kl_divergence |
| 53 | +from qolmat.benchmark import metrics as qmetrics |
| 54 | +from qolmat.imputations import imputers |
| 55 | +from qolmat.utils import data, utils, plot |
| 56 | +# from qolmat.drawing import display_bar_table |
| 57 | + |
| 58 | +``` |
| 59 | + |
| 60 | +<!-- #region tags=[] --> |
| 61 | +### **I. Load data** |
| 62 | +<!-- #endregion --> |
| 63 | + |
| 64 | +The dataset `Beijing` is the Beijing Multi-Site Air-Quality Data Set. It consists in hourly air pollutants data from 12 chinese nationally-controlled air-quality monitoring sites and is available at https://archive.ics.uci.edu/ml/machine-learning-databases/00501/. |
| 65 | +This dataset only contains numerical vairables. |
| 66 | + |
| 67 | +```python |
| 68 | +df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120) |
| 69 | + |
| 70 | +# cols_to_impute = ["TEMP", "PRES", "DEWP", "NO2", "CO", "O3", "WSPM"] |
| 71 | +# cols_to_impute = df_data.columns[df_data.isna().any()] |
| 72 | +cols_to_impute = ["TEMP", "PRES"] |
| 73 | + |
| 74 | +``` |
| 75 | + |
| 76 | +The dataset `Artificial` is designed to have a sum of a periodical signal, a white noise and some outliers. |
| 77 | + |
| 78 | +```python tags=[] |
| 79 | +df_data |
| 80 | +``` |
| 81 | + |
| 82 | +```python |
| 83 | +# df_data = data.get_data_corrupted("Artificial", ratio_masked=.2, mean_size=10) |
| 84 | +# cols_to_impute = ["signal"] |
| 85 | +``` |
| 86 | + |
| 87 | +Let's take a look at variables to impute. We only consider a station, Aotizhongxin. |
| 88 | +Time series display seasonalities (roughly 12 months). |
| 89 | + |
| 90 | +```python |
| 91 | +n_stations = len(df_data.groupby("station").size()) |
| 92 | +n_cols = len(cols_to_impute) |
| 93 | +``` |
| 94 | + |
| 95 | +```python |
| 96 | +df = df_data.loc["Aotizhongxin"].fillna(df_data.median()) |
| 97 | +``` |
| 98 | + |
| 99 | +```python |
| 100 | +df2 = df.copy() |
| 101 | +df2 += 1 |
| 102 | +``` |
| 103 | + |
| 104 | +```python |
| 105 | + |
| 106 | +``` |
| 107 | + |
| 108 | +# Density estimation |
| 109 | + |
| 110 | +```python |
| 111 | +n_samples = 1000 |
| 112 | +n_variables = 1 |
| 113 | +mu1 = np.array([0] * n_variables) |
| 114 | +S1 = .1 * np.ones(n_variables) + .9 * np.eye(n_variables) |
| 115 | +df1 = pd.DataFrame(np.random.multivariate_normal(mu1, S1, n_samples)) |
| 116 | + |
| 117 | +mu2 = np.array([0] * n_variables) + 1 |
| 118 | +S2 = .1 * np.ones(n_variables) + .9 * np.eye(n_variables) |
| 119 | +df2 = pd.DataFrame(np.random.multivariate_normal(mu2, S2, n_samples)) |
| 120 | +``` |
| 121 | + |
| 122 | +```python |
| 123 | +n_estimators = 100 |
| 124 | +estimator = sklearn.ensemble.RandomTreesEmbedding(n_estimators=n_estimators, max_depth=6) |
| 125 | +# y = pd.concat([pd.Series([False] * len(df1)), pd.Series([True] * len(df2))]) |
| 126 | +estimator.fit(df1) |
| 127 | +counts1 = qmetrics.density_from_rf(df1, estimator) |
| 128 | + |
| 129 | +counts_bis = qmetrics.density_from_rf(df1, estimator, df_est=df2) |
| 130 | +``` |
| 131 | + |
| 132 | +```python |
| 133 | +df1 |
| 134 | +``` |
| 135 | + |
| 136 | +```python |
| 137 | +counts_bis |
| 138 | +``` |
| 139 | + |
| 140 | +```python |
| 141 | +plt.plot(df1, counts1, ".") |
| 142 | +``` |
| 143 | + |
| 144 | +```python |
| 145 | +from scipy.stats import norm |
| 146 | + |
| 147 | + |
| 148 | +plt.plot(df1, counts1, ".") |
| 149 | +plt.gca().twinx() |
| 150 | +plt.plot(df2, counts_bis, ".", color=tab10(1)) |
| 151 | + |
| 152 | +plt.gca().twinx() |
| 153 | +x = np.linspace(norm.ppf(0.01), norm.ppf(0.99), 100) |
| 154 | +plt.plot(x, norm.pdf(x), 'r-', lw=5, alpha=0.6, label='norm pdf') |
| 155 | +plt.show() |
| 156 | +``` |
| 157 | + |
| 158 | +# KL div estimation |
| 159 | + |
| 160 | +```python |
| 161 | +n_samples = 10000 |
| 162 | +n_variables = 1 |
| 163 | +mu1 = np.array([0] * n_variables) |
| 164 | +S1 = .1 * np.ones(n_variables) + .9 * np.eye(n_variables) |
| 165 | +df1 = pd.DataFrame(np.random.multivariate_normal(mu1, S1, n_samples)) |
| 166 | + |
| 167 | +mu2 = np.array([0] * n_variables) |
| 168 | +mu2[0] = 1 |
| 169 | +S2 = .1 * np.ones(n_variables) + .9 * np.eye(n_variables) |
| 170 | +df2 = pd.DataFrame(np.random.multivariate_normal(mu2, S2, n_samples)) |
| 171 | +``` |
| 172 | + |
| 173 | +```python |
| 174 | +qmetrics.kl_divergence(df1, df2, df1.notna(), method="random_forest2") |
| 175 | +``` |
| 176 | + |
| 177 | +```python |
| 178 | +qmetrics.kl_divergence(df1, df2, df1.notna(), method="gaussian") |
| 179 | +``` |
| 180 | + |
| 181 | +```python |
| 182 | + |
| 183 | +``` |
0 commit comments