diff --git a/Dissertate.cls b/Dissertate.cls
index d710173..9406d68 100644
--- a/Dissertate.cls
+++ b/Dissertate.cls
@@ -212,6 +212,11 @@
 \definecolor{SchoolColor}{rgb}{0.6471, 0.1098, 0.1882} % Crimson
 \definecolor{chaptergrey}{rgb}{0.6471, 0.1098, 0.1882} % for chapter numbers
 
+\definecolor{matching1}{HTML}{465362}
+\definecolor{matching2}{HTML}{82A3A1}
+\definecolor{matching3}{HTML}{9FC490}
+\definecolor{matching4}{HTML}{C0DFA1}
+
 \hypersetup{
     colorlinks,
     citecolor=SchoolColor,
diff --git a/NeuralNetworks.tex b/NeuralNetworks.tex
new file mode 100644
index 0000000..c8538d3
--- /dev/null
+++ b/NeuralNetworks.tex
@@ -0,0 +1,249 @@
+\chapter{Neural Networks}
+Artificial neural networks (ANNs) or connectionist systems are computing systems vaguely inspired by the biological neural networks that constitute animal brains.
+\href{https://en.wikipedia.org/wiki/Artificial_neural_network}{wikipedia}
+TODO: lot of stuff
+
+\section{Architecture}
+TODO: Perceptron, Fully-Connected, Convolutional, Autoencoder, LSTM etc.
+
+\newcommand{\activationDec}[6]{
+\subsection{#1}
+#2
+\begin{samepage}
+\begin{tabular}{ l | r }
+    \Centering definition & \Centering{derivative} \\\hline
+    $f(x) = #3$ & $f'(x)=#4$\\
+\end{tabular}\\
+\begin{tikzpicture}
+    \begin{axis}[
+        xmin=-3, xmax=3,
+        ymin=-3, ymax=3,
+        ytick={-3,-2,...,3},
+        cycle list name=color-scheme,
+        axis lines = left,
+        extra x ticks={0},
+        extra y ticks={0},
+        extra tick style={grid=major},
+        every axis plot/.append style={ultra thick}
+    ]
+    % derivative plot(s)
+    #6
+    \addlegendentry{$f'(x)$}
+    % function plot(s)
+    #5
+    \addlegendentry{$f(x)$}
+
+    \end{axis}
+\end{tikzpicture}
+\end{samepage}
+\filbreak
+}
+\newcommand{\activation}[5]{
+    \activationDec{#1}{}{#2}{#3}{#4}{#5}
+}
+
+\newcommand{\activationsimple}[5]{
+    \activation{#1}{#2}{#3}{
+        \addplot [
+            domain=-3:3,
+            samples=100,
+            color=SchoolColor
+        ]
+        {(#4)};
+    }
+    {
+        \addplot [
+            domain=-3:3,
+            samples=100,
+            color=matching1
+        ]
+        {(#5)};
+    }
+}
+
+\activationsimple{Identity}{x}{1}{x}{1}
+\activation{Heaviside Step Function}
+    {\begin{cases}
+        1 & x \geq 0 \\
+        0 & \text{else}
+    \end{cases}}
+    {
+        0
+    }
+    {
+        \addplot [
+            domain=-3:0,
+            samples=100,
+            color=SchoolColor
+        ]
+        {0};
+        \addplot [
+            domain=0:3,
+            samples=100,
+            color=SchoolColor
+        ]
+        {1};
+    }
+    {
+        \addplot [
+            domain=-3:3,
+            samples=100,
+            color=matching1
+        ]
+        {0};
+    }
+\activationsimple{Logistic}
+    {\frac{1}{1+e^{-x}}}{\frac{e^{-x}}{(e^{-x} + 1)^2}=f(x)(1-f(x))}
+    {1/(1+e^(-x))}{e^(-x)/((e^(-x) + 1)^2)}
+\activation{ReLU}
+    {\max(0,x)}
+    {
+        \begin{cases}
+            1 & x \geq 0 \\
+            0 & \text{else}
+        \end{cases}
+    }
+    {
+        \addplot [
+            domain=-3:3,
+            samples=100,
+            color=SchoolColor
+        ]
+        {max(0,x)};
+    }
+    {
+        \addplot [
+            domain=-3:0,
+            samples=100,
+            color=matching1
+        ]
+        {0};
+        \addplot [
+            domain=0:3,
+            samples=100,
+            color=matching1
+        ]
+        {1};
+    }
+\activation{Leaky ReLU}
+    {\begin{cases}
+        x & x \geq 0 \\
+        0.01x & \text{else}
+    \end{cases}}
+    {
+        \begin{cases}
+            1 & x \geq 0 \\
+            0.01 & \text{else}
+        \end{cases}
+    }
+    {
+        \addplot [
+            domain=-3:0,
+            samples=100,
+            color=SchoolColor
+        ]
+        {0.01*x};
+        \addplot [
+            domain=0:3,
+            samples=100,
+            color=SchoolColor
+        ]
+        {x};
+    }
+    {
+        \addplot [
+            domain=-3:0,
+            samples=100,
+            color=matching1
+        ]
+        {0.01};
+        \addplot [
+            domain=0:3,
+            samples=100,
+            color=matching1
+        ]
+        {1};
+    }
+\activationsimple{Tanh}
+    {\tanh(x)}{1-\tanh(x)^2}
+    {tanh(x)}{1-tanh(x)^2}
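+As a quick check of the derivative stated above: writing $\tanh(x)=\frac{\sinh(x)}{\cosh(x)}$ and using $\cosh(x)^2-\sinh(x)^2=1$, the quotient rule gives
+\begin{align*}
+\tanh'(x)=\frac{\cosh(x)^2-\sinh(x)^2}{\cosh(x)^2}=1-\tanh(x)^2.
+\end{align*}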
+\activationDec{ELU}
+    {The examples are plotted for $\alpha=0.7$.\\}
+    {\begin{cases}
+        x & x \geq 0 \\
+        \alpha(e^{x}-1) & \text{else}
+    \end{cases}}
+    {\begin{cases}
+        1 & x \geq 0 \\
+        \alpha e^{x}=f(x) + \alpha & \text{else}
+    \end{cases}
+    }
+    {
+        \addplot [
+            domain=-3:0,
+            samples=100,
+            color=SchoolColor
+        ]
+        {0.7*(e^(x)-1)};
+        \addplot [
+            domain=0:3,
+            samples=100,
+            color=SchoolColor
+        ]
+        {x};
+    }
+    {
+        \addplot [
+            domain=-3:0,
+            samples=100,
+            color=matching1
+        ]
+        {0.7*(e^(x))};
+        \addplot [
+            domain=0:3,
+            samples=100,
+            color=matching1
+        ]
+        {1};
+    }
+\activationDec{SELU}
+    {
+    The Scaled Exponential Linear Unit (SELU) is the Exponential Linear Unit (ELU) scaled by a factor $\lambda$, where $\lambda$ and $\alpha$ have been fixed to $1.0507$ and $1.67326$. Neural networks using SELU activations form self-normalizing neural networks. See the \href{https://arxiv.org/abs/1706.02515}{paper} for more information.\\
+    }
+    {\lambda \begin{cases}
+        x & x \geq 0 \\
+        \alpha(e^{x}-1) & \text{else}
+    \end{cases}}
+    {\lambda \begin{cases}
+        1 & x \geq 0 \\
+        \alpha e^{x} & \text{else}
+    \end{cases}
+    }
+    {
+        \addplot [
+            domain=-3:0,
+            samples=100,
+            color=SchoolColor
+        ]
+        {1.0507*1.67326*(e^(x)-1)};
+        \addplot [
+            domain=0:3,
+            samples=100,
+            color=SchoolColor
+        ]
+        {1.0507*x};
+    }
+    {
+        \addplot [
+            domain=-3:0,
+            samples=100,
+            color=matching1
+        ]
+        {1.0507*1.67326*(e^(x))};
+        \addplot [
+            domain=0:3,
+            samples=100,
+            color=matching1
+        ]
+        {1.0507};
+    }
\ No newline at end of file
diff --git a/Statistics.tex b/Statistics.tex
index c1c00ca..3a93341 100644
--- a/Statistics.tex
+++ b/Statistics.tex
@@ -8,7 +8,14 @@ \chapter{Statistics}
 \href{https://en.wikipedia.org/wiki/Statistics}{wikipedia}
 
 \section{Probability}
-TODO: Probability (general + simple), CDF, Variance, Markov-Property, etc.
+TODO: Probability (general + simple), CDF, Variance, Markov-Property, stationarity, etc.
+
+
+\subsection{Statistic}
+A statistic is a function of a sample where the function itself is independent of the sample's distribution; that is, the function can be stated before realization of the data. The term statistic is used both for the function and for the value of the function on a given sample.
+\href{https://en.wikipedia.org/wiki/Statistic}{wikipedia}
+
+
 \subsection{$L_p$-Space for Random-Variables}
 The $L_p$-Norm for Random-Variables $X$, where $\mathbb{E}|X|^p < \infty$, is defined through:
 \begin{align*}
@@ -63,7 +70,67 @@ \subsection{Empirical Distribution}
 \href{http://www.stat.umn.edu/geyer/5102/slides/s1.pdf}{lecture}
 
 \section{Estimation}
-TODO: ML, Score-Function, biased/unbiased, Cramér–Rao bound, confidence-interval
+TODO: biased/unbiased, consistent, sufficient, Cramér–Rao bound, confidence-interval
+
+\subsection{Estimator}
+Examples:
+\begin{enumerate}
+    \item \textit{Point estimation:}\\ The application of a point estimator (a statistic) to the data to obtain a point estimate; see the short example after this list. In machine learning, estimating the parameters of a neural network is usually a (multidimensional) point estimation. \href{https://en.wikipedia.org/wiki/Point_estimation}{wikipedia}
+    \item \textit{Interval estimation:}\\ The use of sample data to calculate an interval of plausible values of an unknown population parameter. \href{https://en.wikipedia.org/wiki/Interval_estimation}{wikipedia}
+    \item \textit{Clustering:}\\ Grouping data into sets of similar objects.
+    \item \textit{Classification:}\\ Assigning categories to data objects.
+\end{enumerate}
+
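+A small worked example for the first two items: given an i.i.d.\ sample $X_1,\dots,X_n \sim \mathcal{N}(\mu,\sigma^2)$ with known $\sigma$, the sample mean $\bar{X}_n=\frac{1}{n}\sum_{i=1}^{n}X_i$ is a point estimator of $\mu$, while
+\begin{align*}
+\left[\bar{X}_n - z_{0.975}\frac{\sigma}{\sqrt{n}},\; \bar{X}_n + z_{0.975}\frac{\sigma}{\sqrt{n}}\right], \quad z_{0.975}\approx 1.96,
+\end{align*}
+is an interval estimator, namely a $95\%$ confidence interval for $\mu$.
+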
+\subsection{Score Function}
+\begin{enumerate}
+    \item It indicates how sensitive a likelihood function $\mathcal{L}(\theta; X)$ is to its parameter $\theta$.
+    \item It is defined as:
+    \begin{align*}
+        u_\theta(x)=\frac{\partial}{\partial \theta}\ln \mathcal{L}(\theta \mid x),
+    \end{align*}
+    where $\mathcal{L}$ is a likelihood function.
+\end{enumerate}
+\href{https://en.wikipedia.org/wiki/Score_(statistics)}{wikipedia}
+
+\subsection{Likelihood}
+\begin{enumerate}
+    \item A likelihood function (often simply the likelihood) is a function of the parameters of a statistical model, given specific observed data.
+    \item Common definitions:
+    \begin{enumerate}
+        \item \textit{Parameterized model:}\\
+        Given a parameterized family of probability density functions (or probability mass functions in the case of discrete distributions) $x\mapsto f(x\mid\theta)$, where $\theta$ is the parameter, the likelihood function is
+        \begin{align*}
+            \theta\mapsto f(x\mid\theta),
+        \end{align*}
+        written
+        \begin{align*}
+            \mathcal{L}(\theta \mid x)=f(x\mid\theta),
+        \end{align*}
+        where $x$ is the observed outcome of an experiment.
+
+        \item \textit{In general:}\\
+        The likelihood function is this density interpreted as a function of the parameter (possibly a vector), not of the possible outcomes. This provides a likelihood function for any probability model with all distributions, whether discrete, absolutely continuous, a mixture or something else.
+    \end{enumerate}
+    \item \textit{Log-likelihood:}\\
+    It is usually convenient to work with the log-likelihood, especially if multiple independent random variables are involved.
+\end{enumerate}
+\href{https://en.wikipedia.org/wiki/Likelihood_function}{wikipedia}
+
+
+\subsection{Maximum-Likelihood Estimator}
+\begin{enumerate}
+    \item Maximum likelihood estimation (MLE) attempts to find the parameter values that maximize the likelihood function, given the observations.
+    \item \textit{Frequentist inference:} MLE is one of several methods to obtain estimates of parameters without using prior distributions.
+\end{enumerate}
+Some properties:
+\begin{enumerate}
+    \item \textit{Consistency:} the sequence of MLEs converges in probability to the value being estimated.
+    \item \textit{Efficiency:} it achieves the Cramér–Rao lower bound when the sample size tends to infinity. This means that no consistent estimator has lower asymptotic mean squared error than the MLE (or other estimators attaining this bound).
+    \item \textit{Second-order efficiency} after correction for bias.
+\end{enumerate}
+\href{https://en.wikipedia.org/wiki/Maximum_likelihood_estimation}{wikipedia}
+
+
 \section{Divergences}
 Conventions for this section: $P$ and $Q$ are probability measures over a set $X$, and $P$ is absolutely continuous with respect to $Q$. $S$ is a space of all probability distributions with common support.
 
@@ -100,7 +167,29 @@ \subsection{Jensen–Shannon divergence}
 \end{align*},
 where $M={\frac {1}{2}}(P+Q)$\\
 \href{https://en.wikipedia.org/wiki/Jensen–Shannon_divergence}{wikipedia}
-\subsection{TODO: Wasserstein \& Wasserstein Dual}
+\subsection{Wasserstein Metric}
+\begin{enumerate}
+    \item Also often called the Earth Mover's Distance (EMD).
+    \item It differs from the KL-Divergence in that it is based on optimal transport rather than on pointwise probability differences.
+\end{enumerate}
+Let $(M,d)$ be a metric space with every probability measure on $M$ being a Radon measure (a so-called Radon space). For $p\geq 1$, let $P_{p}(M)$ denote the collection of all probability measures $\mu$ on $M$ with finite $p^{\text{th}}$ moment, that is, such that $\int_{M}d(x,x_{0})^{p}\,\mathrm{d}\mu(x)<\infty$ for some $x_{0}$ in $M$.
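+As a quick sanity check for the definitions below: if $\mu=\delta_{a}$ and $\nu=\delta_{b}$ are Dirac measures at points $a,b\in M$, then the only coupling of $\mu$ and $\nu$ is $\delta_{(a,b)}$, and therefore
+\begin{align*}
+W_{p}(\delta_{a},\delta_{b})=d(a,b) \quad \text{for every } p\geq 1.
+\end{align*}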
+\subsubsection{Wasserstein Metric - Primal}
+The $p^{\text{th}}$ Wasserstein distance between two probability measures $\mu$ and $\nu$ in $P_{p}(M)$ is defined as
+\begin{align*}
+W_{p}(\mu,\nu) := \left(\inf_{\gamma \in \Gamma(\mu,\nu)}\int_{M\times M}d(x,y)^{p}\,\mathrm{d}\gamma(x,y)\right)^{1/p}
+\end{align*},
+where $\Gamma(\mu,\nu)$ denotes the collection of all measures on $M\times M$ with marginals $\mu$ and $\nu$. ($\Gamma(\mu,\nu)$ is also called the set of all couplings of $\mu$ and $\nu$.)
+\subsubsection{Wasserstein Metric - Dual}
+The following dual representation of $W_1$ is a special case of the duality theorem of Kantorovich and Rubinstein (1958): when $\mu$ and $\nu$ have bounded support,
+\begin{align*}
+W_{1}(\mu,\nu) = \sup \left\{\left.\int_{M}f(x)\,\mathrm{d}(\mu-\nu)(x)\,\right|\,\text{continuous } f\colon M\to\mathbb{R},\ \mathrm{Lip}(f)\leq 1\right\}
+\end{align*},
+where $\mathrm{Lip}(f)$ denotes the minimal Lipschitz constant for $f$.\\
+If the metric $d$ is bounded by some constant $C$, then $2W_{1}(\mu,\nu)\leq C\rho(\mu,\nu)$, where $\rho$ denotes the Radon metric,
+and so convergence in the Radon metric (identical to total variation convergence when $M$ is a Polish space) implies convergence in the Wasserstein metric, but not vice versa.\\
+This article uses material from the Wikipedia article
+\href{https://en.wikipedia.org/wiki/Wasserstein_metric}{Wasserstein metric}, which is released under the \href{https://creativecommons.org/licenses/by-sa/3.0/}{Creative Commons Attribution-Share-Alike License 3.0}.\\
+A good introductory blog post: \href{https://vincentherrmann.github.io/blog/wasserstein/}{wasserstein}.
 
 \section{Information Geometry}
 Information Geometry defines a Riemannian Manifold over probability distributions for statistical models.\\
diff --git a/main.pdf b/main.pdf
index b36a4e9..7357759 100644
Binary files a/main.pdf and b/main.pdf differ
diff --git a/main.tex b/main.tex
index 0f85a5c..badbf79 100644
--- a/main.tex
+++ b/main.tex
@@ -6,6 +6,16 @@
 
 %\usepackage{amsmath}
 \usepackage{mlka_math}
+\usepackage{pgfplots}
+\usetikzlibrary{matrix}
+
+\pgfplotscreateplotcyclelist{color-scheme}{
+{SchoolColor},
+{matching1},
+{matching2},
+{matching3},
+{matching4},
+}
 
 \begin{document}
 
@@ -19,8 +29,9 @@
 
 % \doublespacing
 
 % include each chapter...
-\include{ReinforcementLearning}
 \include{Statistics}
+\include{NeuralNetworks}
+\include{ReinforcementLearning}
 
 \setstretch{\dnormalspacing}