Add variance reduction for proba dist rrule (#20)

BatyLeo · gdalle · web-flow · commit fb0d118f012a · 2024-08-05T11:10:39.000+02:00
* Support keyword arguments in the method applied to FixedAtomsProbabilityDistribution with function f

* update docs

* Typos

* remove kwargs from mean

---------

Co-authored-by: Guillaume Dalle <22795598+gdalle@users.noreply.github.com>
diff --git a/docs/src/DiffExp.bib b/docs/src/DiffExp.bib
@@ -1,49 +1,50 @@
 @misc{blondelElementsDifferentiableProgramming2024,
-  title = {The {{Elements}} of {{Differentiable Programming}}},
-  author = {Blondel, Mathieu and Roulet, Vincent},
-  year = {2024},
-  month = mar,
-  number = {arXiv:2403.14606},
-  eprint = {2403.14606},
-  primaryclass = {cs},
-  publisher = {arXiv},
-  doi = {10.48550/arXiv.2403.14606},
-  url = {http://arxiv.org/abs/2403.14606},
-  urldate = {2024-03-22},
-  abstract = {Artificial intelligence has recently experienced remarkable advances, fueled by large models, vast datasets, accelerated hardware, and, last but not least, the transformative power of differentiable programming. This new programming paradigm enables end-to-end differentiation of complex computer programs (including those with control flows and data structures), making gradient-based optimization of program parameters possible. As an emerging paradigm, differentiable programming builds upon several areas of computer science and applied mathematics, including automatic differentiation, graphical models, optimization and statistics. This book presents a comprehensive review of the fundamental concepts useful for differentiable programming. We adopt two main perspectives, that of optimization and that of probability, with clear analogies between the two. Differentiable programming is not merely the differentiation of programs, but also the thoughtful design of programs intended for differentiation. By making programs differentiable, we inherently introduce probability distributions over their execution, providing a means to quantify the uncertainty associated with program outputs.},
-  archiveprefix = {arXiv},
+  title         = {The {{Elements}} of {{Differentiable Programming}}},
+  author        = {Blondel, Mathieu and Roulet, Vincent},
+  year          = {2024},
+  month         = mar,
+  number        = {arXiv:2403.14606},
+  eprint        = {2403.14606},
+  primaryclass  = {cs},
+  publisher     = {arXiv},
+  doi           = {10.48550/arXiv.2403.14606},
+  url           = {http://arxiv.org/abs/2403.14606},
+  urldate       = {2024-03-22},
+  abstract      = {Artificial intelligence has recently experienced remarkable advances, fueled by large models, vast datasets, accelerated hardware, and, last but not least, the transformative power of differentiable programming. This new programming paradigm enables end-to-end differentiation of complex computer programs (including those with control flows and data structures), making gradient-based optimization of program parameters possible. As an emerging paradigm, differentiable programming builds upon several areas of computer science and applied mathematics, including automatic differentiation, graphical models, optimization and statistics. This book presents a comprehensive review of the fundamental concepts useful for differentiable programming. We adopt two main perspectives, that of optimization and that of probability, with clear analogies between the two. Differentiable programming is not merely the differentiation of programs, but also the thoughtful design of programs intended for differentiation. By making programs differentiable, we inherently introduce probability distributions over their execution, providing a means to quantify the uncertainty associated with program outputs.},
+  archiveprefix = {arXiv}
 }
 % == BibTeX quality report for blondelElementsDifferentiableProgramming2024:
 % ? Title looks like it was stored in title-case in Zotero
 
 @article{koolBuyREINFORCESamples2022,
-  title = {Buy 4 {{REINFORCE Samples}}, {{Get}} a {{Baseline}} for {{Free}}!},
-  author = {Kool, Wouter and van Hoof, Herke and Welling, Max},
-  year = {2022},
-  month = jul,
-  url = {https://openreview.net/forum?id=r1lgTGL5DE},
-  urldate = {2023-04-17},
+  title    = {Buy 4 {{REINFORCE Samples}}, {{Get}} a {{Baseline}} for {{Free}}!},
+  author   = {Kool, Wouter and van Hoof, Herke and Welling, Max},
+  year     = {2022},
+  month    = jul,
+  journal  = {ICLR},
+  url      = {https://openreview.net/forum?id=r1lgTGL5DE},
+  urldate  = {2023-04-17},
   abstract = {REINFORCE can be used to train models in structured prediction settings to directly optimize the test-time objective. However, the common case of sampling one prediction per datapoint (input) is data-inefficient. We show that by drawing multiple samples (predictions) per datapoint, we can learn with significantly less data, as we freely obtain a REINFORCE baseline to reduce variance. Additionally we derive a REINFORCE estimator with baseline, based on sampling without replacement. Combined with a recent technique to sample sequences without replacement using Stochastic Beam Search, this improves the training procedure for a sequence model that predicts the solution to the Travelling Salesman Problem.},
-  langid = {english},
-  language = {en},
+  langid   = {english},
+  language = {en}
 }
 % == BibTeX quality report for koolBuyREINFORCESamples2022:
 % Missing required field 'journal'
 % ? Title looks like it was stored in title-case in Zotero
 % ? unused Library catalog ("openreview.net")
 
 @article{mohamedMonteCarloGradient2020,
-  title = {Monte {{Carlo Gradient Estimation}} in {{Machine Learning}}},
-  author = {Mohamed, Shakir and Rosca, Mihaela and Figurnov, Michael and Mnih, Andriy},
-  year = {2020},
-  journal = {Journal of Machine Learning Research},
-  volume = {21},
-  number = {132},
-  pages = {1--62},
-  issn = {1533-7928},
-  url = {http://jmlr.org/papers/v21/19-346.html},
-  urldate = {2022-10-21},
-  abstract = {This paper is a broad and accessible survey of the methods we have at our disposal for Monte Carlo gradient estimation in machine learning and across the statistical sciences: the problem of computing the gradient of an expectation of a function with respect to parameters defining the distribution that is integrated; the problem of sensitivity analysis. In machine learning research, this gradient problem lies at the core of many learning problems, in supervised, unsupervised and reinforcement learning. We will generally seek to rewrite such gradients in a form that allows for Monte Carlo estimation, allowing them to be easily and efficiently used and analysed. We explore three strategies---the pathwise, score function, and measure-valued gradient estimators---exploring their historical development, derivation, and underlying assumptions. We describe their use in other fields, show how they are related and can be combined, and expand on their possible generalisations. Wherever Monte Carlo gradient estimators have been derived and deployed in the past, important advances have followed. A deeper and more widely-held understanding of this problem will lead to further advances, and it is these advances that we wish to support.},
+  title    = {Monte {{Carlo Gradient Estimation}} in {{Machine Learning}}},
+  author   = {Mohamed, Shakir and Rosca, Mihaela and Figurnov, Michael and Mnih, Andriy},
+  year     = {2020},
+  journal  = {Journal of Machine Learning Research},
+  volume   = {21},
+  number   = {132},
+  pages    = {1--62},
+  issn     = {1533-7928},
+  url      = {http://jmlr.org/papers/v21/19-346.html},
+  urldate  = {2022-10-21},
+  abstract = {This paper is a broad and accessible survey of the methods we have at our disposal for Monte Carlo gradient estimation in machine learning and across the statistical sciences: the problem of computing the gradient of an expectation of a function with respect to parameters defining the distribution that is integrated; the problem of sensitivity analysis. In machine learning research, this gradient problem lies at the core of many learning problems, in supervised, unsupervised and reinforcement learning. We will generally seek to rewrite such gradients in a form that allows for Monte Carlo estimation, allowing them to be easily and efficiently used and analysed. We explore three strategies---the pathwise, score function, and measure-valued gradient estimators---exploring their historical development, derivation, and underlying assumptions. We describe their use in other fields, show how they are related and can be combined, and expand on their possible generalisations. Wherever Monte Carlo gradient estimators have been derived and deployed in the past, important advances have followed. A deeper and more widely-held understanding of this problem will lead to further advances, and it is these advances that we wish to support.}
 }
 % == BibTeX quality report for mohamedMonteCarloGradient2020:
 % ? Title looks like it was stored in title-case in Zotero
diff --git a/docs/src/background.md b/docs/src/background.md
@@ -5,7 +5,7 @@ Most of the math below is taken from [mohamedMonteCarloGradient2020](@citet).
 Consider a function $f: \mathbb{R}^n \to \mathbb{R}^m$, a parameter $\theta \in \mathbb{R}^d$ and a parametric probability distribution $p(\theta)$ on the input space.
 Given a random variable $X \sim p(\theta)$, we want to differentiate the expectation of $Y = f(X)$ with respect to $\theta$:
 
-$$E(\theta) = \mathbb{E}[f(X)] = \int f(x) ~ p(x | \theta) ~\mathrm{d} x$$
+$$E(\theta) = \mathbb{E}[f(X)] = \int f(x) ~ p(x | \theta) ~\mathrm{d} x = \int y ~ q(y | \theta) ~\mathrm{d} y$$
 
 Usually this is approximated with Monte-Carlo sampling: let $x_1, \dots, x_S \sim p(\theta)$ be i.i.d., we have the estimator
 
@@ -15,7 +15,7 @@ $$E(\theta) \simeq \frac{1}{S} \sum_{s=1}^S f(x_s)$$
 
 Since $E$ is a vector-to-vector function, the key quantity we want to compute is its Jacobian matrix $\partial E(\theta) \in \mathbb{R}^{m \times d}$:
 
-$$\partial E(\theta) = \int y ~ \nabla_\theta q(y | \theta)^\top ~ \mathrm{d} y = \int f(x) ~ \nabla_\theta p(x | \theta)^\top ~\mathrm{d} x$$
+$$\partial E(\theta) = \int f(x) ~ \nabla_\theta p(x | \theta)^\top ~\mathrm{d} x = \int y ~ \nabla_\theta q(y | \theta)^\top ~ \mathrm{d} y$$
 
 However, to implement automatic differentiation, we only need the vector-Jacobian product (VJP) $\partial E(\theta)^\top \bar{y}$ with an output cotangent $\bar{y} \in \mathbb{R}^m$.
 See the book by [blondelElementsDifferentiableProgramming2024](@citet) to know more.
@@ -33,7 +33,7 @@ The REINFORCE estimator is derived with the help of the identity $\nabla \log u
 $$\begin{aligned}
 \partial E(\theta)
 & = \int f(x) ~ \nabla_\theta p(x | \theta)^\top ~ \mathrm{d}x \\
-& = \int f(x) ~ p(x | \theta) \nabla_\theta \log p(x | \theta)^\top ~ \mathrm{d}x \\
+& = \int f(x) ~ \nabla_\theta \log p(x | \theta)^\top p(x | \theta) ~ \mathrm{d}x \\
 & = \mathbb{E} \left[f(X) \nabla_\theta \log p(X | \theta)^\top\right] \\
 \end{aligned}$$
 
@@ -53,7 +53,7 @@ For $S > 1$ Monte-Carlo samples, we have
 
 $$\begin{aligned}
 \partial E(\theta)^\top \bar{y} 
-& \simeq \frac{1}{S} \sum_{s=1}^S \left(f(x_s) - \frac{1}{S - 1}\sum_{j\neq i} f(x_j) \right)^\top \bar{y} ~ \nabla_\theta\log p(x_s | \theta)\\
+& \simeq \frac{1}{S} \sum_{s=1}^S \left(f(x_s) - \frac{1}{S - 1}\sum_{j\neq s} f(x_j) \right)^\top \bar{y} ~ \nabla_\theta\log p(x_s | \theta)\\
 & = \frac{1}{S - 1}\sum_{s=1}^S (f(x_s) - b)^\top \bar{y} ~ \nabla_\theta\log p(x_s | \theta)
 \end{aligned}$$
 
@@ -90,38 +90,55 @@ The following reparametrizations are implemented:
 
 ## Probability gradients
 
-In addition to the expectation, we may also want gradients for individual output densities $q(y | \theta) = \mathbb{P}(f(X) = y)$.
+When $f$ takes values in a finite set $\mathcal{Y} = \{y_1, \dots, y_K\}$, we may also want to compute the Jacobian of the vector of probability weights:
+
+$$q : \theta \longmapsto \begin{pmatrix} q(y_1|\theta) = \mathbb{P}(f(X) = y_1|\theta) \\ \dots \\ q(y_K|\theta) = \mathbb{P}(f(X) = y_K|\theta) \end{pmatrix}$$
+
+whose Jacobian is given by
+
+$$\partial_\theta q(\theta) = \begin{pmatrix} \nabla_\theta q(y_1|\theta)^\top \\ \dots \\ \nabla_\theta q(y_K|\theta)^\top \end{pmatrix}$$
 
 ### REINFORCE probability gradients
 
 The REINFORCE technique can be applied in a similar way:
 
-$$q(y | \theta) = \mathbb{E}[\mathbf{1}\{f(X) = y\}]  = \int \mathbf{1} \{f(x) = y\} ~ p(x | \theta) ~ \mathrm{d}x$$
+$$q(y_k | \theta) = \mathbb{E}[\mathbf{1}\{f(X) = y_k\}]  = \int \mathbf{1} \{f(x) = y_k\} ~ p(x | \theta) ~ \mathrm{d}x$$
 
 Differentiating through the integral,
 
 $$\begin{aligned}
-\nabla_\theta q(y | \theta)
-& = \int \mathbf{1} \{f(x) = y\} ~ \nabla_\theta p(x | \theta) ~ \mathrm{d}x \\
-& = \mathbb{E} [\mathbf{1} \{f(X) = y\} ~ \nabla_\theta \log p(X | \theta)]
+\nabla_\theta q(y_k | \theta)
+& = \int \mathbf{1} \{f(x) = y_k\} ~ \nabla_\theta p(x | \theta) ~ \mathrm{d}x \\
+& = \mathbb{E} [\mathbf{1} \{f(X) = y_k\} ~ \nabla_\theta \log p(X | \theta)]
 \end{aligned}$$
 
 The Monte-Carlo approximation for this is
 
-$$\nabla_\theta q(y | \theta) \simeq \frac{1}{S} \sum_{s=1}^S \mathbf{1} \{f(x_s) = y\} ~ \nabla_\theta \log p(x_s | \theta)$$
+$$\nabla_\theta q(y_k | \theta) \simeq \frac{1}{S} \sum_{s=1}^S \mathbf{1} \{f(x_s) = y_k\} ~ \nabla_\theta \log p(x_s | \theta)$$
+
+The VJP is then
 
-In our implementation, we assume that the sampled $y_s$ are pairwise distinct (maybe not necessary?), and that together they form the whole support of the distribution $q$.
-We can thus consider the vector-to-vector mapping
+$$\begin{aligned}
+\partial_\theta q(\theta)^\top \bar{q} &= \sum_{k=1}^K \bar{q}_k \nabla_\theta q(y_k | \theta)\\
+&\simeq  \frac{1}{S} \sum_{s=1}^S \left[\sum_{k=1}^K \bar{q}_k \mathbf{1} \{f(x_s) = y_k\}\right] ~ \nabla_\theta \log p(x_s | \theta)
+\end{aligned}$$
 
-$$q : \theta \longmapsto \begin{pmatrix} q(y_1|\theta) \\ \dots \\ q(y_S | \theta) \end{pmatrix}$$
+In our implementation, the [`empirical_distribution`](@ref) method outputs an empirical [`FixedAtomsProbabilityDistribution`](@ref) with uniform weights $\frac{1}{S}$, where some $x_s$ can be repeated.
 
-whose Jacobian is given by
+$$q : \theta \longmapsto \begin{pmatrix} q(f(x_1)|\theta) \\ \dots \\ q(f(x_S) | \theta) \end{pmatrix}$$
+
+We therefore define the corresponding VJP as
+
+$$\partial_\theta q(\theta)^\top \bar{q} = \frac{1}{S} \sum_{s=1}^S \bar{q}_s \nabla_\theta \log p(x_s | \theta)$$
 
-$$\partial_\theta q(\theta) = \frac{1}{S} \begin{pmatrix} \nabla_\theta \log p(x_1 | \theta)^\top \\ \dots \\ \nabla_\theta \log p(x_S | \theta)^\top \end{pmatrix}$$
+If $\bar q$ comes from `mean`, we have $\bar q_s = f(x_s)^\top \bar y$ and we obtain the REINFORCE VJP.
 
-and whose VJP is given by
+This VJP can be interpreted as an empirical expectation, to which we can also apply variance reduction:
+$$\partial_\theta q(\theta)^\top \bar q \approx \frac{1}{S-1}\sum_s(\bar q_s - b') \nabla_\theta \log p(x_s|\theta)$$
+with $b' = \frac{1}{S}\sum_s \bar q_s$.
 
-$$\partial_\theta q(\theta)^\top \bar{q} = \frac{1}{S} \sum_s \bar{q}_s \nabla_\theta \log p(x_s | \theta)$$
+Again, if $\bar q$ comes from `mean`, we have $\bar q_s = f(x_s)^\top \bar y$ and $b' = b^\top \bar y$. We then obtain the REINFORCE backward rule with variance reduction:
+$$\partial_\theta q(\theta)^\top \bar q \approx \frac{1}{S-1}\sum_s(f(x_s) - b)^\top \bar y \nabla_\theta \log p(x_s|\theta)$$
 
 ### Reparametrization probability gradients
 
diff --git a/src/reinforce.jl b/src/reinforce.jl
@@ -124,8 +124,12 @@ function ChainRulesCore.rrule(
 end
 
 function ChainRulesCore.rrule(
-    rc::RuleConfig, ::typeof(empirical_distribution), E::Reinforce, θ...; kwargs...
-)
+    rc::RuleConfig,
+    ::typeof(empirical_distribution),
+    E::Reinforce{t,variance_reduction},
+    θ...;
+    kwargs...,
+) where {t,variance_reduction}
     project_θ = ProjectTo(θ)
 
     (; f, nb_samples) = E
@@ -137,12 +141,22 @@ function ChainRulesCore.rrule(
     _dist_logdensity_grad_partial(x) = dist_logdensity_grad(rc, E, x, θ...)
     gs = mymap(is_threaded(E), _dist_logdensity_grad_partial, xs)
 
+    adjusted_nb_samples = nb_samples - (variance_reduction && nb_samples > 1)
+
     function pullback_Reinforce_probadist(Δdist_thunked)
         Δdist = unthunk(Δdist_thunked)
         Δps = Δdist.weights
+        Δps_mean = mean(Δps)
+        Δps_baseline = if (variance_reduction && nb_samples > 1)
+            Δps .- Δps_mean
+        else
+            Δps
+        end
         ΔE = @not_implemented("The fields of the `Reinforce` object are constant.")
         _single_sample_pullback(gᵢ, Δpᵢ) = gᵢ .* Δpᵢ
-        Δθ = mymapreduce(is_threaded(E), _single_sample_pullback, .+, gs, Δps) ./ nb_samples
+        Δθ =
+            mymapreduce(is_threaded(E), _single_sample_pullback, .+, gs, Δps_baseline) ./
+            adjusted_nb_samples
         Δθ_proj = project_θ(Δθ)
         return (NoTangent(), ΔE, Δθ_proj...)
     end
diff --git a/test/distribution.jl b/test/distribution.jl
@@ -17,8 +17,6 @@ rng = StableRNG(63)
 for threaded in (false, true)
     dist = FixedAtomsProbabilityDistribution([2, 3], [0.4, 0.6]; threaded)
 
-    string(dist)
-
     @test length(dist) == 2
 
     @test mean(dist) ≈ 2.6
diff --git a/test/expectation.jl b/test/expectation.jl
@@ -129,5 +129,5 @@ end
     )
     r_split(θ...) = mean(empirical_distribution(r, θ...))
     @test r(μ, σ) == r_split(μ, σ)
-    @test_broken gradient(r, μ, σ) == gradient(r_split, μ, σ)
+    @test all(isapprox.(gradient(r, μ, σ), gradient(r_split, μ, σ); atol=1e-10))
 end

Original file line number	Diff line number	Diff line change
`@@ -129,5 +129,5 @@ end`
`129`	`129`	`)`
`130`	`130`	`r_split(θ...) = mean(empirical_distribution(r, θ...))`
`131`	`131`	`@test r(μ, σ) == r_split(μ, σ)`
`132`		`- @test_broken gradient(r, μ, σ) == gradient(r_split, μ, σ)`
	`132`	`+ @test all(isapprox.(gradient(r, μ, σ), gradient(r_split, μ, σ); atol=1e-10))`
`133`	`133`	`end`