@@ -3115,11 +3115,327 @@ \subsection{Exercise 09}
 \item example with Fourier series, i.e. polynomial regression
 \end{itemize}
 
-no slides so far
+\end{frame}
 
 
+\begin{frame}[t]{Ex09: Bias Variance Trade-Off vs. Model Complexity}
+%
+\vspace{-1em}
+%
+expected total error = (model bias$^2$) + (model variance) + (data noise variance)
+%
+\begin{table}
+\begin{tabular}{|l|l|l|}
+\hline
+true model & lowest bias$^2$ & lowest variance\\\hline
+low model complexity & high bias$^2$ & low variance\\\hline
+high model complexity & low bias$^2$ & high variance\\\hline
+optimum model complexity & \multicolumn{2}{l|}{optimum bias$^2$+variance}\\\hline
+\end{tabular}
+\end{table}
+%
+\begin{center}
+\begin{tikzpicture}
+\begin{axis}[
+width=12cm,
+height=6cm,
+legend style={at={(0.015,0.65)}, anchor=north west},
+xtick={-6,0,6},
+xticklabels={(too) low, optimum, (too) high},
+xlabel = {model complexity / \# of non-zero model parameters},
+ytick={0,1},
+yticklabels={low, high},
+ylabel = {bias$^2$ / variance},
+]
+\addplot [domain=-6:6, C0, ultra thick, samples=32] {1-1/(1+exp(-x))};
+\addplot [domain=-6:6, C1, ultra thick, samples=32] {1/(1+exp(-x))};
+\addlegendentry{bias$^2$}
+\addlegendentry{variance}
+\end{axis}
+\end{tikzpicture}
+\end{center}
+%
 \end{frame}
 
+
+
+
+
+\begin{frame}[t]{Bias Variance Trade-Off vs. Regularisation}
+%
+\vspace{-1em}
+%
+expected total error = (model bias$^2$) + (model variance) + (data noise variance)
+%
+\begin{table}
+\begin{tabular}{|l|l|l|}
+\hline
+true model & lowest bias$^2$ & lowest variance\\\hline
+high regularisation & high bias$^2$ & low variance\\\hline
+low regularisation & low bias$^2$ & high variance\\\hline
+optimum regularisation & \multicolumn{2}{l|}{optimum bias$^2$+variance}\\\hline
+\end{tabular}
+\end{table}
+%
+\vspace{-0.5em}
+%
+\begin{center}
+\includegraphics[width=0.8\textwidth]{../bias_variance_plots/bias_var_l2_regularisation.png}
+\end{center}
+%
+\end{frame}
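+
+% Added frame: a minimal NumPy sketch of the L2-regularised (ridge) least
+% squares estimator behind the trade-off above -- not the plotting code that
+% produced the figure. Data, seed and lambda values are arbitrary choices.
+\begin{frame}[fragile]{Bias Variance Trade-Off vs. Regularisation: Sketch}
+$\cdot$ a sketch (ours, not the figure's source code): the ridge estimator $\hat{\bm{\beta}}_\lambda = (\bm{X}^\mathrm{T}\bm{X} + \lambda\bm{I})^{-1}\bm{X}^\mathrm{T}\bm{y}$; larger $\lambda$ $\rightarrow$ higher bias$^2$, lower variance
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+M, N = 50, 5
+X = rng.standard_normal((M, N))              # some design matrix
+y = X @ np.ones(N) + rng.standard_normal(M)  # noisy measurements
+
+def ridge_fit(X, y, lam):
+    # closed-form L2-regularised least squares
+    return np.linalg.solve(X.T @ X + lam * np.eye(X.shape[1]),
+                           X.T @ y)
+
+for lam in (0.0, 1.0, 100.0):  # low -> high regularisation
+    print(lam, np.linalg.norm(ridge_fit(X, y, lam)))
+\end{verbatim}
+\end{frame}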
+
+
+
+
+
+
+
+
+\begin{frame}[t]{Bias Variance Trade-Off: Intro Example}
+%
+\vspace{-1em}
+%
+$\cdot$ ground truth model ($N = 1+4 = 5$ features) with a full-column-rank $M \times N$ matrix, $M>N$
+$$\bm{x}_1 = \frac{2\pi}{M} \cdot 0,\quad \bm{x}_2 = \frac{2\pi}{M} \cdot 1,\quad \dots,\quad \bm{x}_M = \frac{2\pi}{M} \cdot (M-1)$$
+$$
+\bm{X}_t =
+\begin{bmatrix}
+1 & \cos(\bm{x}_1) & \sin(2 \bm{x}_1) & \cos(5 \bm{x}_1) & \cos(6 \bm{x}_1) \\
+1 & \cos(\bm{x}_2) & \sin(2 \bm{x}_2) & \cos(5 \bm{x}_2) & \cos(6 \bm{x}_2)\\
+\vdots & \vdots & \vdots & \vdots & \vdots \\
+1 & \cos(\bm{x}_M) & \sin(2 \bm{x}_M) & \cos(5 \bm{x}_M) & \cos(6 \bm{x}_M)\\
+\end{bmatrix}\qquad
+\bm{\beta}_t =
+\begin{bmatrix}
+3 \\ 2 \\ 1 \\ \nicefrac{1}{2}\\ \nicefrac{1}{4}
+\end{bmatrix}
+\qquad
+\bm{t} = \bm{X}_t \bm{\beta}_t
+$$
+
+$\cdot$ zero-mean, fixed-variance noise $\bm{n}$ $\rightarrow$ $l$ measurements, $1 \leq l \leq L$
+$$\bm{y}^{(l)} = \bm{t} + \bm{n}^{(l)}$$
+
+$\cdot$ OLS with a model design matrix $\bm{X}$ and the $l$-th data set $\bm{y}^{(l)}$
+\begin{align*}
+&\hat{\bm{\beta}}^{(l)} = \quad\,\,\,\, (\bm{X}^\mathrm{T} \bm{X})^{-1} \bm{X}^\mathrm{T} \bm{y}^{(l)}\\
+\hat{\bm{y}}^{(l)} = \bm{X}\cdot &\hat{\bm{\beta}}^{(l)} = \bm{X}\cdot (\bm{X}^\mathrm{T} \bm{X})^{-1} \bm{X}^\mathrm{T} \bm{y}^{(l)}
+\end{align*}
+
+$\cdot$ measurement: $\bm{y}_m^{(l)}$ is the $m$-th entry of vector $\bm{y}^{(l)}$,\quad prediction: $\hat{\bm{y}}_m^{(l)}$ is the $m$-th entry of vector $\hat{\bm{y}}^{(l)}$
+
+\end{frame}
3229+
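+% Added frame: one possible NumPy translation of the intro example above.
+% M, L, the seed and the unit noise variance are arbitrary choices of ours.
+\begin{frame}[fragile]{Bias Variance Trade-Off: Intro Example in Code}
+\begin{verbatim}
+import numpy as np
+
+M, L = 100, 200                   # samples per set, number of sets
+x = 2 * np.pi / M * np.arange(M)  # x_1 ... x_M
+
+# ground-truth design matrix X_t and coefficients beta_t
+X_t = np.column_stack([np.ones(M), np.cos(x), np.sin(2 * x),
+                       np.cos(5 * x), np.cos(6 * x)])
+beta_t = np.array([3, 2, 1, 1 / 2, 1 / 4])
+t = X_t @ beta_t                  # noise-free truth
+
+rng = np.random.default_rng(0)
+Y = t + rng.standard_normal((L, M))  # y^(l) = t + n^(l)
+
+# OLS on each data set; here the model is the true model itself
+X = X_t
+B_hat = np.linalg.lstsq(X, Y.T, rcond=None)[0]  # one column per l
+Y_hat = (X @ B_hat).T             # predictions, one row per l
+\end{verbatim}
+\end{frame}
+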
+\begin{frame}[t]{Bias Variance Trade-Off: Math}
+%
+\vspace{-1em}
+%
+$\cdot$ mean of all predictions $\rightarrow$ 1st order raw moment
+$$
+\begin{bmatrix}
+|\\ \tilde{\bm{y}}\\ |
+\end{bmatrix}
+=
+\frac{1}{L}
+\left(
+\begin{bmatrix}
+|\\ \hat{\bm{y}}^{(1)}\\ |
+\end{bmatrix}
++\begin{bmatrix}
+|\\ \hat{\bm{y}}^{(2)}\\ |
+\end{bmatrix}
++
+\dots
++
+\begin{bmatrix}
+|\\ \hat{\bm{y}}^{(L)}\\ |
+\end{bmatrix}
+\right)
+$$
+%
+$\cdot$ \underline{bias$^2$}: how much the mean of all predictions deviates from the true data $\rightarrow$ 2nd order moment
+$$
+\begin{bmatrix}
+|\\ \bm{e}_b\\ |
+\end{bmatrix}=
+\begin{bmatrix}
+|\\ \bm{t}\\ |
+\end{bmatrix}-
+\begin{bmatrix}
+|\\ \tilde{\bm{y}}\\ |
+\end{bmatrix}
+\qquad
+\text{bias}^2 = \frac{1}{M}\bm{e}_b^\mathrm{T} \bm{e}_b = \frac{1}{M} \sum\limits_{m=1}^{M} (\bm{t}_m - \tilde{\bm{y}}_m)^2
+$$
+%
+$\cdot$ mean of squared (elementwise) deviations of the predictions from their mean $\tilde{\bm{y}}$ $\rightarrow$ 2nd order centralised moment
+$$
+\begin{bmatrix}
+|\\ \bm{v}\\ |
+\end{bmatrix}
+=
+\frac{1}{L}
+\left(
+\begin{bmatrix}
+|\\ (\hat{\bm{y}}^{(1)}-\tilde{\bm{y}})^2 \\ |
+\end{bmatrix}
++\begin{bmatrix}
+|\\ (\hat{\bm{y}}^{(2)}-\tilde{\bm{y}})^2 \\ |
+\end{bmatrix}
++
+\dots
++
+\begin{bmatrix}
+|\\ (\hat{\bm{y}}^{(L)}-\tilde{\bm{y}})^2 \\ |
+\end{bmatrix}
+\right)
+$$
+%
+$\cdot$ \underline{variance}: we want a single number summarising $\bm{v}$ $\rightarrow$ 1st order raw moment (=mean)
+$$
+\text{variance} = \frac{1}{M} \sum\limits_{m=1}^{M} \bm{v}_m
+$$
+
+\end{frame}
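+
+% Added frame: the two moments above in NumPy, assuming the stacked
+% predictions Y_hat (one row per data set) from the previous sketch.
+\begin{frame}[fragile]{Bias Variance Trade-Off: Math in Code}
+\begin{verbatim}
+import numpy as np
+
+def bias2_and_variance(Y_hat, t):
+    # Y_hat: (L, M) predictions, t: (M,) truth
+    y_tilde = Y_hat.mean(axis=0)          # mean prediction
+    e_b = t - y_tilde                     # bias error vector
+    bias2 = np.mean(e_b ** 2)             # squared bias
+    v = ((Y_hat - y_tilde) ** 2).mean(axis=0)  # spread per m
+    variance = v.mean()                   # single-number variance
+    return bias2, variance
+\end{verbatim}
+\end{frame}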
+
+
+
+
+\begin{frame}[t]{Bias Variance Trade-Off: Essence of Example}
+%
+\vspace{-1em}
+%
+\begin{center}
+\begin{tikzpicture}
+\begin{axis}[
+width=12cm,
+height=6cm,
+legend style={at={(0.015,0.65)}, anchor=north west},
+xtick={-6,0,6},
+xticklabels={too simple, robust, too complex},
+xlabel = {model complexity / \# of non-zero model parameters},
+ytick={0,1},
+yticklabels={low, high},
+ylabel = {bias$^2$ / variance},
+]
+\addplot [domain=-6:6, C0, ultra thick, samples=32] {1-1/(1+exp(-x))};
+\addplot [domain=-6:6, C1, ultra thick, samples=32] {1/(1+exp(-x))};
+\addlegendentry{bias$^2$}
+\addlegendentry{variance}
+\end{axis}
+\end{tikzpicture}
+\end{center}
+%
+\begin{align*}
+\bm{X} =
+\begin{bmatrix}
+1 & \bm{x}_1\\
+1 & \bm{x}_2\\
+\vdots & \vdots \\
+1 & \bm{x}_M
+\end{bmatrix}
+%
+\qquad\qquad
+\bm{X} =
+\begin{bmatrix}
+1 & \cos(\bm{x}_1) & \sin(2\bm{x}_1)\\
+1 & \cos(\bm{x}_2) & \sin(2\bm{x}_2)\\
+\vdots & \vdots & \vdots \\
+1 & \cos(\bm{x}_M) & \sin(2\bm{x}_M)
+\end{bmatrix}
+%
+\qquad\qquad
+\bm{X}=?
+\end{align*}
+
+\end{frame}
+
+
+
+
+\begin{frame}[t]{Example: True Data}
+\centering
+\includegraphics[width=0.8\textwidth]{../bias_variance_plots/true_data.png}
+\end{frame}
+
+\begin{frame}[t]{Example: True Model}
+\centering
+\includegraphics[width=1\textwidth]{../bias_variance_plots/true_model.png}
+\end{frame}
+
+\begin{frame}[t]{Example: Model Too Simple}
+\centering
+\includegraphics[width=1\textwidth]{../bias_variance_plots/too_simple_model.png}
+\end{frame}
+
+\begin{frame}[t]{Example: Model Too Complex}
+\centering
+\includegraphics[width=1\textwidth]{../bias_variance_plots/too_complex_model.png}
+\end{frame}
+
+\begin{frame}[t]{Example: Robust Model}
+\centering
+\includegraphics[width=1\textwidth]{../bias_variance_plots/robust_model.png}
+\end{frame}
+
+
+\begin{frame}[t]{Empirical Correlation Coefficient $R^2$ Between $\mathbf{y}$ and $\hat{\mathbf{y}}$}
+\vspace{-1em}
+$\cdot$ measured $\bm{y}^{(l)}$, predicted $\hat{\bm{y}}^{(l)}$
+
+$\cdot$ everything is calculated for the $l$-th data set, but the index $l$ is omitted:
+
+- Sum of Squares \textbf{Error} (SS\textbf{E})
+$$\mathrm{SSE} = \sum_{m=1}^{M} (\bm{y}_m - \hat{\bm{y}}_m)^2 = (\bm{y} - \bm{X}\hat{\bm{\beta}})^\mathrm{T} (\bm{y} - \bm{X}\hat{\bm{\beta}})$$
+
+- mean of measured data
+$$\bar{y} = \frac{1}{M} \sum_{m=1}^{M} \bm{y}_m$$
+
+- Sum of Squares \textbf{Total} (SS\textbf{T})
+$$\mathrm{SST} = \sum_{m=1}^{M} (\bm{y}_m - \bar{y})^2$$
+
+- Sum of Squares (due to) \textbf{Regression} (SS\textbf{R})
+$$\mathrm{SSR} = \sum_{m=1}^{M} (\hat{\bm{y}}_m - \bar{y})^2$$
+
+$$\mathrm{SST} = \mathrm{SSR} + \mathrm{SSE} \qquad \text{(for OLS models with an intercept)}$$
+
+\end{frame}
+
+\begin{frame}[t]{Empirical Correlation Coefficient $R^2$ Between $\mathbf{y}$ and $\hat{\mathbf{y}}$}
+\vspace{-1em}
+$$\mathrm{SST} = \mathrm{SSR} + \mathrm{SSE}$$
+
+$\cdot$ empirical correlation coefficient or coefficient of determination $0 \leq R^2 \leq 1$
+
+$$R^2 = \frac{\mathrm{SSR}}{\mathrm{SST}} = \frac{\mathrm{SST}-\mathrm{SSE}}{\mathrm{SST}} = 1 - \frac{\mathrm{SSE}}{\mathrm{SST}}$$
+
+$\cdot$ normalise to be independent of the number of data samples $M$ and the number of features $N$
+$$R_\text{adjusted}^2 = 1 - \frac{\frac{\mathrm{SSE}}{M-N}}{\frac{\mathrm{SST}}{M-1}}$$
+
+$\cdot$ this formula for $R_\text{adjusted}^2$ holds only for models with an intercept!
+
+\vspace{1em}
+
+$\cdot$ hence: measured $\bm{y}^{(l)}$, model design matrix $\bm{X}$, fitted $\hat{\bm{\beta}}^{(l)}$, predicted $\hat{\bm{y}}^{(l)}$ $\rightarrow$ $R_\text{adjusted}^{2,(l)}$
+
+\end{frame}
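+
+% Added frame: SSE/SST-based scores in NumPy; the function name and the
+% convention that N counts the intercept column are our own choices.
+\begin{frame}[fragile]{$R^2$ and $R_\text{adjusted}^2$ in Code}
+\begin{verbatim}
+import numpy as np
+
+def r2_scores(y, y_hat, N):
+    # y, y_hat: (M,) measured / predicted; N: number of features
+    M = len(y)
+    sse = np.sum((y - y_hat) ** 2)     # sum of squares error
+    sst = np.sum((y - y.mean()) ** 2)  # sum of squares total
+    r2 = 1 - sse / sst                 # = SSR/SST with intercept
+    r2_adj = 1 - (sse / (M - N)) / (sst / (M - 1))
+    return r2, r2_adj
+\end{verbatim}
+\end{frame}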
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 \subsection{Exercise 10}
 \begin{frame}{Ex 10: Gradient Descent}
 