Generalize backprop of base arg grads, add tests.

ztangent · ztangent · commit 449c640a42a6 · 2023-01-10T01:37:48.000-05:00
diff --git a/src/modeling_library/dist_dsl/dist_dsl.jl b/src/modeling_library/dist_dsl/dist_dsl.jl
@@ -26,14 +26,73 @@ all_indices(arg::SimpleArg) = [arg.i]
 all_indices(arg::TransformedArg) = vcat([all_indices(a) for a in arg.f_args]...)
 
 # Evaluate user-facing args to concrete values passed to the base distribution
-eval_arg(x::Any, args) = x
-eval_arg(x::SimpleArg, args) = typecheck_arg(x, args[x.i])
-eval_arg(x::TransformedArg, args) =
-    x.arg_passer(x.orig_f, [eval_arg(a, args) for a in x.f_args]...)
+eval_arg(base_arg, args) = base_arg  # not a DSL arg: hand the value to the base distribution as-is
+eval_arg(base_arg::SimpleArg, args) = typecheck_arg(base_arg, getindex(args, base_arg.i))
+eval_arg(base_arg::TransformedArg, args) =
+    base_arg.arg_passer(base_arg.orig_f, map(a -> eval_arg(a, args), base_arg.f_args)...)
+
+# Evaluate gradients of base distribution args with respect to user-facing args
+function eval_arg_gradient(base_arg::Any, base_type::Type, args)
+    # A constant base arg has zero derivative w.r.t. every numeric user-facing arg (`nothing` otherwise).
+    # Array constants get a zero Jacobian of size (length(base_arg), length(arg)), the layout that
+    # logpdf_grad's `g' * vec(base_grad)` step needs; scalar constants get zeros shaped like `arg`.
+    return map(enumerate(args)) do (i, arg)
+        (arg isa Real || arg isa AbstractArray && eltype(arg) <: Real) || return nothing
+        base_arg isa AbstractArray || return zero(arg)
+        return zeros(eltype(arg), length(base_arg), length(arg))
+    end
+end
+
+function eval_arg_gradient(base_arg::SimpleArg{T}, base_type::Type, args) where {T}
+    # The base arg is args[base_arg.i] itself: identity w.r.t. that arg, zero w.r.t. every other
+    # numeric arg, `nothing` for non-numeric args. If the passed-through arg is an array, each entry
+    # is a (length(base arg) x length(arg)) Jacobian; otherwise each entry is shaped like `arg`.
+    passed = args[base_arg.i]
+    n_out = passed isa AbstractArray ? length(passed) : nothing
+    return map(enumerate(args)) do (i, arg)
+        (arg isa Real || arg isa AbstractArray && eltype(arg) <: Real) || return nothing
+        N, V = length(arg), eltype(arg)
+        i == base_arg.i && return (arg isa Real ? one(arg) : Matrix{V}(LinearAlgebra.I, N, N))
+        return isnothing(n_out) ? zero(arg) : zeros(V, n_out, N)
+    end
+end
+
+# Compute gradients when base arg is a scalar type (one entry per user-facing arg, others held fixed)
+function eval_arg_gradient(base_arg::TransformedArg, base_type::Type{<:Real}, args)
+    splice_arg(arg, i) = [args[1:i-1]..., arg, args[i+1:end]...]
+    per_arg_eval(arg, i) = eval_arg(base_arg, splice_arg(arg, i))
+    return map(enumerate(args)) do (i, arg)
+        if arg isa Real
+            # Wrap the scalar in a 1-element array for ReverseDiff, but give eval_arg the element, not the array
+            ReverseDiff.gradient(a -> per_arg_eval(a[1], i), [arg])[1]
+        elseif arg isa AbstractArray && eltype(arg) <: Real
+            ReverseDiff.gradient(a -> per_arg_eval(a, i), arg)
+        else
+            nothing # non-numeric args carry no gradient
+        end
+    end
+end
+
+# Compute Jacobians when base arg is an array type (one entry per user-facing arg, others held fixed)
+function eval_arg_gradient(base_arg::TransformedArg, base_type::Type{<:AbstractArray{<:Real}}, args)
+    splice_arg(arg, i) = [args[1:i-1]..., arg, args[i+1:end]...]
+    per_arg_eval(arg, i) = eval_arg(base_arg, splice_arg(arg, i))
+    return map(enumerate(args)) do (i, arg)
+        if arg isa Real
+            # Wrap the scalar in a 1-element array for ReverseDiff, but give eval_arg the element, not the array
+            ReverseDiff.jacobian(a -> per_arg_eval(a[1], i), [arg])
+        elseif arg isa AbstractArray && eltype(arg) <: Real
+            ReverseDiff.jacobian(a -> per_arg_eval(a, i), arg)
+        else
+            nothing # non-numeric args carry no gradient
+        end
+    end
+end
 
 # Type of SimpleArg must match arg, otherwise a MethodError will be thrown
-typecheck_arg(x::SimpleArg{T}, arg::T) where {T} = arg
-typecheck_arg(x::SimpleArg{T}, arg::ReverseDiff.TrackedReal{T}) where {T <: Real} = arg
+typecheck_arg(base_arg::SimpleArg{T}, arg::T) where {T} = arg # plain value whose type matches T: returned unchanged
+typecheck_arg(base_arg::SimpleArg{T}, arg::ReverseDiff.TrackedReal{T}) where {T <: Real} = arg # tracked scalar whose first type parameter is T
+typecheck_arg(base_arg::SimpleArg{T}, arg::ReverseDiff.TrackedArray{V, D, N, T}) where {V, D, N, T} = arg # tracked array whose fourth type parameter is T (presumably the wrapped value-array type; confirm against ReverseDiff)
 
 # DistWithArgs
 struct DistWithArgs{T}
@@ -72,25 +131,32 @@ function logpdf_grad(d::CompiledDistWithArgs{T}, x::T, args...) where T
     concrete_args = [eval_arg(arg, args) for arg in d.arglist]
     base_has_arg_grads = has_argument_grads(d.base)
     base_grads = logpdf_grad(d.base, x, concrete_args...)
-
-    base_arg_grads = [g for (i, g) in enumerate(base_grads[2:end])
-                      if base_has_arg_grads[i]]
-    argvec = collect(args)
-    if !isempty(argvec)
-        eval_arg_grads = [ReverseDiff.gradient(xs -> eval_arg(arg, xs), argvec) for
-                          (i, arg) in enumerate(d.arglist) if base_has_arg_grads[i]]
-        eval_arg_grads = reduce(hcat, eval_arg_grads)
-    end
-
-    retval = [base_grads[1]]
-    for i in 1:d.n_args
-        if self_has_arg_grads[i]
-            push!(retval, eval_arg_grads[i,:]' * base_arg_grads)
-        else
-            push!(retval, nothing)
+    base_arg_grads = base_grads[2:end]
+
+    # Set gradient with respect to output
+    self_output_grad = base_grads[1] 
+
+    # Backpropagate gradients from base arguments to arguments
+    self_arg_grads = [self_has_arg_grads[i] ? zero(arg) : nothing
+                      for (i, arg) in enumerate(args)]
+
+    for (i, base_arg) in enumerate(d.arglist)
+        base_has_arg_grads[i] || continue
+        base_grad = base_arg_grads[i]
+        base_arg_type = typeof(concrete_args[i])
+        eval_arg_grad = eval_arg_gradient(base_arg, base_arg_type, args) 
+        for (j, g) in enumerate(eval_arg_grad)
+            (isnothing(g) || !self_has_arg_grads[j]) && continue
+            if base_grad isa AbstractArray
+                increment = reshape(g' * vec(base_grad), size(self_arg_grads[j]))
+            else
+                increment = g * base_grad
+            end
+            self_arg_grads[j] = self_arg_grads[j] .+ increment
         end
     end
-    return Tuple(retval)
+
+    return (self_output_grad, self_arg_grads...)
 end
 
 function random(d::CompiledDistWithArgs{T}, args...)::T where T
diff --git a/src/modeling_library/dist_dsl/relabeled_distribution.jl b/src/modeling_library/dist_dsl/relabeled_distribution.jl
@@ -19,13 +19,13 @@ function logpdf(d::WithLabelArg{T, U}, x::T, collection, base_args...) where {T,
 end
 
 function logpdf_grad(d::WithLabelArg{T, U}, x::T, collection, base_args...) where {T, U}
-    base_arg_grads = fill(nothing, length(base_args))
+    base_arg_grads = Vector{Any}(nothing, length(base_args))
 
     for p in pairs(collection)
         (index, item) = (p.first, p.second)
         if item == x
             new_grads = logpdf_grad(d.base, index, base_args...)
-            for (arg_idx, grad) in enumerate(new_grads)
+            for (arg_idx, grad) in enumerate(new_grads[2:end])
                 if base_arg_grads[arg_idx] === nothing
                     base_arg_grads[arg_idx] = grad
                 elseif grad !== nothing
@@ -73,13 +73,13 @@ function logpdf(d::RelabeledDistribution{T, U}, x::T, base_args...) where {T, U}
 end
 
 function logpdf_grad(d::RelabeledDistribution{T, U}, x::T, base_args...) where {T, U}
-    base_arg_grads = fill(nothing, length(base_args))
+    base_arg_grads = Vector{Any}(nothing, length(base_args))
 
     for p in pairs(d.collection)
         (index, item) = (p.first, p.second)
         if item == x
             new_grads = logpdf_grad(d.base, index, base_args...)
-            for (arg_idx, grad) in enumerate(new_grads)
+            for (arg_idx, grad) in enumerate(new_grads[2:end])
                 if base_arg_grads[arg_idx] === nothing
                     base_arg_grads[arg_idx] = grad
                 elseif grad !== nothing
diff --git a/test/modeling_library/dist_dsl.jl b/test/modeling_library/dist_dsl.jl
@@ -5,20 +5,40 @@
   @test isapprox(logpdf(f, 1., 0.), logpdf(normal, 0., 0., 0.001))
 
   # Test gradients of transformed distributions
-  @dist shifted_normal(mu, sigma) = Gen.normal(mu, sigma) + 1.
-  @test isapprox(logpdf(shifted_normal, 1., 0., 1.), logpdf(normal, 0., 0., 1.))
+  @dist shifted_normal(mu, sigma) = normal(mu, sigma) + 1.
+  @test logpdf(shifted_normal, 1., 0., 1.) == logpdf(normal, 0., 0., 1.)
   @test logpdf_grad(shifted_normal, 0., 0., 1.) == logpdf_grad(normal, -1., 0., 1.)
 
   # Test gradients of transformed distributions with no parameters
-  @dist shifted_std_normal() = Gen.normal(0., 1.) + 1.
-  @test isapprox(logpdf(shifted_std_normal, 1.), logpdf(normal, 0., 0., 1.))
+  @dist shifted_std_normal() = normal(0., 1.) + 1.
+  @test logpdf(shifted_std_normal, 1.) == logpdf(normal, 0., 0., 1.)
   @test logpdf_grad(shifted_std_normal, 0.) == (logpdf_grad(normal, -1., 0., 1.)[1],)
 
+  # Test gradients of multivariate distributions
+  @dist vec_normal(mu, sigma) = broadcasted_normal(broadcast(+, mu, 1.0), broadcast(*, sigma, 2.0))
+  @test logpdf(vec_normal, zeros(2), zeros(2), ones(2)) ==
+        logpdf(broadcasted_normal, zeros(2), ones(2), 2 .* ones(2))
+  transformed_grads = logpdf_grad(vec_normal, zeros(2), zeros(2), ones(2))
+  orig_grads = logpdf_grad(broadcasted_normal, zeros(2), ones(2), 2 .* ones(2)) 
+  @test transformed_grads[1] == orig_grads[1]
+  @test transformed_grads[2] == orig_grads[2]
+  @test transformed_grads[3] == 2.0 * orig_grads[3]
+      
+  # Test gradients of multivariate distributions with multi-dimensional arguments
+  @dist transformed_mvnormal(mu, sigma) = mvnormal(broadcast(+, mu, 1.0), broadcast(*, sigma, 2.0))
+  @test logpdf(transformed_mvnormal, zeros(2), zeros(2), ones(2, 2)) ==
+        logpdf(mvnormal, zeros(2), ones(2), 2 .* ones(2, 2))
+  transformed_grads = logpdf_grad(transformed_mvnormal, zeros(2), zeros(2), ones(2, 2))
+  orig_grads = logpdf_grad(mvnormal, zeros(2), ones(2), 2 .* ones(2, 2)) 
+  @test transformed_grads[1] == orig_grads[1]
+  @test transformed_grads[2] == orig_grads[2]
+  @test transformed_grads[3] == 2.0 * orig_grads[3]
+
   # Test relabeled distributions with labels provided as an Array
   @dist labeled_cat(labels, probs) = labels[categorical(probs)]
   @test labeled_cat([:a, :b], [0., 1.]) == :b
   @test isapprox(logpdf(labeled_cat, :b, [:a, :b], [0.5, 0.5]), log(0.5))
-  @test logpdf_grad(labeled_cat, :b, [:a, :b], [0.5, 0.5]) == logpdf_grad(categorical, 2, [0.5, 0.5])
+  @test logpdf_grad(labeled_cat, :b, [:a, :b], [0.5, 0.5]) == (nothing, logpdf_grad(categorical, 2, [0.5, 0.5])...)
   @test logpdf(labeled_cat, :c, [:a, :b], [0.5, 0.5]) == -Inf
 
   # Test relabeled distributions with labels provided in a Dict
@@ -55,7 +75,7 @@ end
   @test symbol_cat([:a, :b], [0., 1.]) == :b
   @test_throws MethodError symbol_cat(["a", "b"], [0., 1.])
   @test logpdf(symbol_cat, :c, [:a, :b], [0.5, 0.5]) == -Inf
-  @test logpdf_grad(symbol_cat, :b, [:a, :b], [0.5, 0.5]) == logpdf_grad(categorical, 2, [0.5, 0.5])
+  @test logpdf_grad(symbol_cat, :b, [:a, :b], [0.5, 0.5]) == (nothing, logpdf_grad(categorical, 2, [0.5, 0.5])...)
   @test_throws MethodError logpdf(symbol_cat, "c", [:a, :b], [0.5, 0.5])
 
   # Test typed parameters