\documentclass[]{article}
\usepackage{etex}
\usepackage[margin = 1.5in]{geometry}
\setlength{\parindent}{0in}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{listings}
\usepackage{color}
\usepackage{mathtools}
\usepackage{multicol}
\usepackage[lined]{algorithm2e}
\usepackage{float}
\usepackage[T1]{fontenc}
\usepackage{ae,aecompl}
\usepackage[pdftex,
pdfauthor={Michael Noukhovitch},
pdftitle={IFT 6268 Self-Supervised Representation Learning},
pdfsubject={Lecture notes from IFT 6268, Fall 2020},
pdfproducer={LaTeX},
pdfcreator={pdflatex}]{hyperref}
\usepackage{cleveref}
\usepackage{enumitem}
\definecolor{dkgreen}{rgb}{0,0.6,0}
\definecolor{gray}{rgb}{0.5,0.5,0.5}
\definecolor{mauve}{rgb}{0.58,0,0.82}
\lstset{
language=C,
aboveskip=3mm,
belowskip=3mm,
showstringspaces=false,
columns=flexible,
basicstyle={\small\ttfamily},
numbers=none,
numberstyle=\tiny\color{gray},
keywordstyle=\color{blue},
commentstyle=\color{dkgreen},
stringstyle=\color{mauve},
breaklines=true,
breakatwhitespace=true,
tabsize=4
}
\theoremstyle{definition}
\newtheorem*{defn}{Definition}
\newtheorem{ex}{Example}[section]
\newtheorem*{theorem}{Theorem}
\setlength{\marginparwidth}{1.5in}
\setlength{\algomargin}{0.75em}
\DeclarePairedDelimiter{\set}{\lbrace}{\rbrace}
\definecolor{darkish-blue}{RGB}{25,103,185}
\hypersetup{
colorlinks,
citecolor=darkish-blue,
filecolor=darkish-blue,
linkcolor=darkish-blue,
urlcolor=darkish-blue
}
\newcommand{\lecture}[1]{\marginpar{{\footnotesize $\leftarrow$ \underline{#1}}}}
\makeatletter
\def\blfootnote{\gdef\@thefnmark{}\@footnotetext}
\makeatother
\begin{document}
\let\ref\Cref
\title{\textbf{IFT 6268 Self-Supervised Representation Learning}}
\date{Fall 2020 \\ Notes written from Aaron Courville's lectures.}
\author{Michael Noukhovitch}
\maketitle
\newpage
\tableofcontents
\newpage
\section{Introduction}
\subsection{Motivation}%
\label{sub:motivation}
We want good representation learning but
\begin{itemize}
\item scaling supervised learning isn't feasible
\item we need to learn useful representations unsupervised, but purely unsupervised learning needs a more informative training signal
\end{itemize}
\textbf{Self-supervised learning} (SSL) recovers useful/semantic representations by training models to answer specific questions about the data
\begin{itemize}
\item between supervised and unsupervised
\item can procedurally generate infinite annotation
\item answering the question requires fundamental understanding
\item the main challenge is choosing a good question that allows learning \textit{without} extra labels
\end{itemize}
\subsection{Autoencoders}%
\label{sub:autoencoders}
Generative models (AE, VAE) have historically been ineffective compared to supervised learning
\begin{itemize}
\item caveat: language modelling (BERT-style prediction of the input) works well
\item caveat: with small datasets, unsupervised pretraining helps
\end{itemize}
Old-school autoencoders
\begin{itemize}
\item couldn't learn the semantics of MNIST
\item even adding depth didn't help
\end{itemize}
The denoising AE (Vincent et al., 2008) reconstructs $x$ from a corrupted input
\begin{itemize}
\item corrupt $x + \text{noise} = \tilde x$ and use $\tilde x$ to predict $x$
\item the representation becomes robust to noise
\item formally corresponds to learning an energy function with minima at the true data points $x$ (see the sketch below)
\end{itemize}
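One common instantiation, as a sketch: assume Gaussian corruption and squared-error reconstruction, with encoder $f_\theta$ and decoder $g_\phi$ (these names are for illustration only)
\[
\mathcal{L}_{\mathrm{dAE}}(\theta, \phi) = \mathbb{E}_{x \sim p(x)}\, \mathbb{E}_{\tilde{x} \sim \mathcal{N}(x,\, \sigma^2 I)} \left[ \big\lVert x - g_\phi\big(f_\theta(\tilde{x})\big) \big\rVert^2 \right]
\]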
The contractive AE (Rifai et al., 2011) adds a loss that penalizes retaining any information
\begin{itemize}
\item the autoencoder (reconstruction) loss keeps the ``good'' information
\item a secondary loss on the Jacobian of the encoder pushes it to throw away \textit{all} information
\item the net effect is to reduce the ``bad'' information kept (the full loss is written out below)
\item admits second-order methods and may be more stable than the dAE
\end{itemize}
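Written out (Rifai et al., 2011), with encoder $f$, decoder $g$, penalty weight $\lambda$, and Frobenius norm $\lVert \cdot \rVert_F$:
\[
\mathcal{L}_{\mathrm{CAE}} = \sum_{x} \lVert x - g(f(x)) \rVert^2 + \lambda \left\lVert \frac{\partial f(x)}{\partial x} \right\rVert_F^2
\]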
\subsection{Transfer Learning}%
\label{sub:transfer_learning}
A domain $D$ consists of a feature space $\mathbb{X}$ and a marginal probability distribution $P(X)$; a task $T$ consists of a label space $\mathbb{Y}$ and a predictive function $f(\cdot)$.
Given a source domain $D_s$ with learning task $T_s$ and a target domain $D_t$ with learning task $T_t$, \textbf{transfer learning} aims to improve the learning of the target function $f_t$ in $D_t$ using the knowledge in $D_s$ and $T_s$. Three settings:
\begin{enumerate}
\item \textbf{inductive}: same domain, different tasks
\item \textbf{transductive}: different domains, same task (a.k.a.\ \textbf{domain adaptation})
\item \textbf{unsupervised}: both different
\end{enumerate}
SSL usually has an unlabelled source and a labelled target.
For SSL, the source task $T_s$ should
\begin{itemize}
\item be unsupervised
\item extract semantic information from the input
\item learn the correct invariances
\end{itemize}
\subsection{Image Methods}%
\label{sub:questions}
Rotation prediction (Gidaris et al., 2018): predict how much each image has been rotated
\begin{itemize}
\item you generate your own labels
\item but learning the answer requires some semantic knowledge about images (see the loss below)
\end{itemize}
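A concrete form of the objective (following Gidaris et al., 2018), where $\mathrm{rot}(x, r)$ denotes $x$ rotated by $r$:
\[
\mathcal{L}_{\mathrm{rot}}(\theta) = \mathbb{E}_{x \sim p(x)} \left[ -\frac{1}{4} \sum_{r \in \{0^{\circ}, 90^{\circ}, 180^{\circ}, 270^{\circ}\}} \log p_\theta\big(r \mid \mathrm{rot}(x, r)\big) \right]
\]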
GANs: are they SSL?
\begin{itemize}
\item if you're just training a generator, no (that's just generative modelling)
\item if you're learning a discriminator augmented with fake data (Salimans et al., 2016), yes
\end{itemize}
\subsection{Contrastive Methods}%
\label{sub:contrastive_methods}
Mutual information: $I(X;Y) = D_{\mathrm{KL}}(P_{X,Y} \,\|\, P_X \otimes P_Y)$
\begin{itemize}
\item the ``intersection'' of the marginal entropies (made precise below)
\item difficult to compute in general
\end{itemize}
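In terms of entropies, which makes the ``intersection'' picture precise:
\[
I(X;Y) = H(X) + H(Y) - H(X,Y) = H(X) - H(X \mid Y)
\]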
MINE (Belghazi et al., 2018) optimizes a lower bound on MI
\begin{itemize}
\item encode an image with a network
\item learn a critic (discriminator) with the lower-bound MI loss, written out below
\end{itemize}
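Concretely, MINE uses the Donsker--Varadhan representation of the KL divergence: for any critic network $T_\theta$,
\[
I(X;Y) \geq \mathbb{E}_{P_{X,Y}}\big[ T_\theta(x, y) \big] - \log \mathbb{E}_{P_X \otimes P_Y}\big[ e^{T_\theta(x, y)} \big],
\]
and the bound is maximized over $\theta$ by gradient ascent.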
Deep InfoMax (Hjelm et al., 2019) tries to learn ``important'' image regions
\begin{itemize}
\item uses local image patches
\item maximizes MI between the global feature vector and the local patch features (sketched below)
\end{itemize}
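A sketch of the local objective: with $M$ local patch features $f_i(x)$ and a global summary vector $g(x)$, average an MI estimator $\widehat{I}_\theta$ (e.g.\ a bound like the one above) over patch locations
\[
\max_\theta\; \frac{1}{M} \sum_{i=1}^{M} \widehat{I}_\theta\big( f_i(x);\, g(x) \big)
\]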
CPC (van den Oord et al., 2018)
\begin{itemize}
\item predict future feature vectors from context vectors
\item single neurons end up learning semantic meaning
\end{itemize}
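CPC trains with the InfoNCE loss: the true future code $z_{t+k}$ is scored against negative codes $z_j$ from other positions or sequences, using a log-bilinear critic (notation from van den Oord et al., 2018):
\[
\mathcal{L}_{\mathrm{InfoNCE}} = -\,\mathbb{E} \left[ \log \frac{\exp\big( z_{t+k}^{\top} W_k\, c_t \big)}{\sum_{z_j \in Z} \exp\big( z_j^{\top} W_k\, c_t \big)} \right]
\]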
\subsection{Iterated Learning}%
\label{sub:iterated_learning}
\textbf{Iterated learning} uses learners to teach other learners
\begin{itemize}
\item the compositional structure of natural language may have emerged through iterated teaching of language (Kirby et al., 2014)
\item may encourage compositional structure in neural models
\item may encourage systematic generalization
\end{itemize}
\textbf{Self-training} similarly learns from its own pseudo-labels
\begin{itemize}
\item Noisy Student self-training (Xie et al., 2020) iterates teacher--student training on ImageNet, as sketched below
\end{itemize}
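A minimal sketch of the loop, paraphrasing Xie et al.\ (2020); $\theta_T$ and $\theta_S$ are names used only in this sketch:
\begin{algorithm}[H]
train a teacher $\theta_T$ on labelled data $(x, y)$\;
\Repeat{performance stops improving}{
pseudo-label the unlabelled images: $\tilde{y} \leftarrow \theta_T(\tilde{x})$\;
train an equal-or-larger student $\theta_S$ on $(x, y) \cup (\tilde{x}, \tilde{y})$ with noise (data augmentation, dropout, stochastic depth)\;
$\theta_T \leftarrow \theta_S$\;
}
\caption{Noisy Student self-training (sketch)}
\end{algorithm}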
\textbf{Systematic generalization} is generalizing via rules shared between training and testing, rather than via a shared distribution
\begin{itemize}
\item seq2seq models reliably fail at this (Lake and Baroni, 2018)
\item maybe SSL can help here
\end{itemize}
\end{document}