Commit af5a2d8

Add Integrated Gradients analyzer (#65)
* Reorganize input_augmentation.jl
* Rename `InputAugmentation` to `NoiseAugmentation`
* Add `InterpolationAugmentation` and tests
* Add `IntegratedGradients` and tests
* Update docs and readme
* Add benchmark for `IntegratedGradients`
* Update deprecated MLDatasets calls to API of `v0.7`
* Allow any type of `Sampleable` in `NoiseAugmentation`
1 parent f1b89ab commit af5a2d8

14 files changed (+164 −63 lines)

README.md
Lines changed: 2 additions & 2 deletions

@@ -33,7 +33,7 @@ using BSON: @load
 model = strip_softmax(model)
 
 # Load input
-x, _ = MNIST.testdata(Float32, 10)
+x, _ = MNIST(Float32, :test)[10]
 input = reshape(x, 28, 28, 1, :) # reshape to WHCN format
 
 # Run XAI method
@@ -55,6 +55,7 @@ Currently, the following analyzers are implemented:
 ├── Gradient
 ├── InputTimesGradient
 ├── SmoothGrad
+├── IntegratedGradients
 └── LRP
     ├── LRPZero
     ├── LRPEpsilon
@@ -66,7 +67,6 @@ Individual LRP rules like `ZeroRule`, `EpsilonRule`, `GammaRule` and `ZBoxRule`
 
 ## Roadmap
 In the future, we would like to include:
-- [Integrated Gradients](https://arxiv.org/abs/1703.01365)
 - [PatternNet](https://arxiv.org/abs/1705.05598)
 - [DeepLift](https://arxiv.org/abs/1704.02685)
 - [LIME](https://arxiv.org/abs/1602.04938)

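For orientation, here is a minimal sketch of how the newly added analyzer slots into the README's MNIST example above. The call sequence is an assumption based on this diff (it mirrors the existing `analyze`/`heatmap` usage in the docs), not text from the commit itself; `model` is the stripped-softmax Flux model loaded earlier in the README.

```julia
using ExplainableAI
using MLDatasets

# Load a single MNIST test image and reshape it to WHCN format, as in the README above
x, _ = MNIST(Float32, :test)[10]
input = reshape(x, 28, 28, 1, :)

# The new analyzer; 50 interpolation steps is the default per the added docstring
analyzer = IntegratedGradients(model, 50)
expl = analyze(input, analyzer)   # returns an Explanation
heatmap(input, analyzer)          # or visualize directly as a heatmap
```
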
benchmark/benchmarks.jl
Lines changed: 1 addition & 0 deletions

@@ -22,6 +22,7 @@ algs = Dict(
     "LRPZero" => LRPZero,
     "LRPCustom" => LRPCustom, #modifies weights
     "SmoothGrad" => model -> SmoothGrad(model, 10),
+    "IntegratedGradients" => model -> IntegratedGradients(model, 10),
 )
 
 # Define benchmark

docs/Project.toml
Lines changed: 2 additions & 0 deletions

@@ -5,5 +5,7 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 ExplainableAI = "4f1bc3e1-d60d-4ed0-9367-9bdff9846d3b"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
+ImageIO = "82e4d734-157c-48bb-816b-45c225c6df19"
+ImageShow = "4e3cecfd-b093-5904-9786-8bbb286a6a31"
 Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
 MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"

docs/literate/advanced_lrp.jl
Lines changed: 2 additions & 2 deletions

@@ -15,7 +15,7 @@ using BSON
 model = BSON.load("../model.bson", @__MODULE__)[:model]
 
 index = 10
-x, y = MNIST.testdata(Float32, index)
+x, _ = MNIST(Float32, :test)[10]
 input = reshape(x, 28, 28, 1, :);
 
 # ## Custom LRP composites
@@ -200,7 +200,7 @@ analyzer = LRPZero(model)
 # They in-place modify a pre-allocated array of the input relevance `Rₖ`
 # based on the input activation `aₖ` and output relevance `Rₖ₊₁`.
 #
-# Calling `analyze` then applies a foward-pass of the model, keeping track of
+# Calling `analyze` then applies a forward-pass of the model, keeping track of
 # the activations `aₖ` for each layer `k`.
 # The relevance `Rₖ₊₁` is then set to the output neuron activation and the rules are applied
 # in a backward-pass over the model layers and previous activations.

docs/literate/example.jl
Lines changed: 6 additions & 3 deletions

@@ -29,11 +29,13 @@ model = BSON.load("../model.bson", @__MODULE__)[:model]
 # We use MLDatasets to load a single image from the MNIST dataset:
 using MLDatasets
 using ImageCore
+using ImageIO
+using ImageShow
 
 index = 10
-x, y = MNIST.testdata(Float32, index)
+x, _ = MNIST(Float32, :test)[10]
 
-MNIST.convert2image(x)
+convert2image(MNIST, x)
 
 # By convention in Flux.jl, this input needs to be resized to WHCN format by adding a color channel and batch dimensions.
 input = reshape(x, 28, 28, 1, :);
@@ -82,7 +84,7 @@ heatmap(input, analyzer, 5)
 # ## Input batches
 # ExplainableAI also supports input batches:
 batchsize = 100
-xs, _ = MNIST.testdata(Float32, 1:batchsize)
+xs, _ = MNIST(Float32, :test)[1:batchsize]
 batch = reshape(xs, 28, 28, 1, :) # reshape to WHCN format
 expl_batch = analyze(batch, analyzer);
 
@@ -106,6 +108,7 @@ mosaic(heatmap(batch, analyzer, 1); nrow=10)
 # ├── Gradient
 # ├── InputTimesGradient
 # ├── SmoothGrad
+# ├── IntegratedGradients
 # └── LRP
 #     ├── LRPZero
 #     ├── LRPEpsilon

docs/src/api.md
Lines changed: 4 additions & 2 deletions

@@ -11,11 +11,13 @@ LRP
 Gradient
 InputTimesGradient
 SmoothGrad
+IntegratedGradients
 ```
 
-`SmoothGrad` is a special case of `InputAugmentation`, which can be applied as a wrapper to any analyzer:
+`SmoothGrad` and `IntegratedGradients` are special cases of the input augmentation wrappers `NoiseAugmentation` and `InterpolationAugmentation`, which can be applied as a wrapper to any analyzer:
 ```@docs
-InputAugmentation
+NoiseAugmentation
+InterpolationAugmentation
 ```
 
 # LRP

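As a concrete illustration of the documentation change above, a hedged sketch of wrapping other analyzers in the two augmentation wrappers. Constructor arguments follow the docstrings added in this commit; which analyzers benefit from augmentation in practice is left to the docs, and the variable names below are illustrative only.

```julia
# NoiseAugmentation averages explanations over n noisy copies of the input
# (SmoothGrad is exactly NoiseAugmentation around a Gradient analyzer).
smoothed_lrp = NoiseAugmentation(LRPZero(model), 50)

# InterpolationAugmentation averages gradients along a straight path from a
# reference input to the input (IntegratedGradients wraps a Gradient analyzer).
ig_style = InterpolationAugmentation(Gradient(model), 50)

expl = analyze(input, smoothed_lrp)
```
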
src/ExplainableAI.jl
Lines changed: 3 additions & 2 deletions

@@ -2,7 +2,7 @@ module ExplainableAI
 
 using Base.Iterators
 using LinearAlgebra
-using Distributions
+using Distributions: Distribution, Sampleable, Normal
 using Random: AbstractRNG, GLOBAL_RNG
 using Flux
 using Zygote
@@ -34,7 +34,8 @@ export analyze
 # Analyzers
 export AbstractXAIMethod
 export Gradient, InputTimesGradient
-export InputAugmentation, SmoothGrad
+export NoiseAugmentation, SmoothGrad
+export InterpolationAugmentation, IntegratedGradients
 export LRP, LRPZero, LRPEpsilon, LRPGamma
 
 # LRP rules

src/gradient.jl
Lines changed: 12 additions & 1 deletion

@@ -61,4 +61,15 @@ in a neighborhood of the input, typically by adding Gaussian noise with mean 0.
 # References
 [1] Smilkov et al., SmoothGrad: removing noise by adding noise
 """
-SmoothGrad(model, n=50, args...) = InputAugmentation(Gradient(model), n, args...)
+SmoothGrad(model, n=50, args...) = NoiseAugmentation(Gradient(model), n, args...)
+
+"""
+    IntegratedGradients(analyzer, [n=50])
+    IntegratedGradients(analyzer, [n=50])
+
+Analyze model by using the Integrated Gradients method.
+
+# References
+[1] Sundararajan et al., Axiomatic Attribution for Deep Networks
+"""
+IntegratedGradients(model, n=50) = InterpolationAugmentation(Gradient(model), n)

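For background, the one-liner `IntegratedGradients(model, n) = InterpolationAugmentation(Gradient(model), n)` is a Riemann-sum approximation of the attribution defined by Sundararajan et al. [1]. A sketch of the quantity being approximated (notation introduced here, not in the commit): x′ is the reference input, here `zero(input)`, F is the model output for the selected neuron, and the t_k are the n equally spaced interpolation points used by `interpolate_batch`.

```math
\mathrm{IG}_i(x)
  = (x_i - x'_i) \int_0^1
      \frac{\partial F\bigl(x' + \alpha\,(x - x')\bigr)}{\partial x_i}\, \mathrm{d}\alpha
  \;\approx\;
    (x_i - x'_i)\, \frac{1}{n} \sum_{k=1}^{n}
      \frac{\partial F\bigl(x' + t_k\,(x - x')\bigr)}{\partial x_i},
\qquad t_k \text{ equally spaced in } [0, 1].
```
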
src/input_augmentation.jl
Lines changed: 117 additions & 48 deletions

@@ -1,48 +1,3 @@
-"""
-    InputAugmentation(analyzer, n, [std=1, rng=GLOBAL_RNG])
-    InputAugmentation(analyzer, n, distribution, [rng=GLOBAL_RNG])
-
-A wrapper around analyzers that augments the input with `n` samples of additive noise sampled from `distribution`.
-This input augmentation is then averaged to return an `Explanation`.
-"""
-struct InputAugmentation{A<:AbstractXAIMethod,D<:Distribution,R<:AbstractRNG} <:
-       AbstractXAIMethod
-    analyzer::A
-    n::Integer
-    distribution::D
-    rng::R
-end
-function InputAugmentation(analyzer, n, distr, rng=GLOBAL_RNG)
-    return InputAugmentation(analyzer, n, distr, rng)
-end
-function InputAugmentation(analyzer, n, σ::Real=0.1f0, args...)
-    return InputAugmentation(analyzer, n, Normal(0.0f0, Float32(σ)^2), args...)
-end
-
-function (aug::InputAugmentation)(input, ns::AbstractNeuronSelector)
-    # Regular forward pass of model
-    output = aug.analyzer.model(input)
-    output_indices = ns(output)
-
-    # Call regular analyzer on augmented batch
-    augmented_input = add_noise(augment_batch_dim(input, aug.n), aug.distribution, aug.rng)
-    augmented_indices = augment_indices(output_indices, aug.n)
-    augmented_expl = aug.analyzer(augmented_input, AugmentationSelector(augmented_indices))
-
-    # Average explanation
-    return Explanation(
-        reduce_augmentation(augmented_expl.attribution, aug.n),
-        output,
-        output_indices,
-        augmented_expl.analyzer,
-        Nothing,
-    )
-end
-
-function add_noise(A::AbstractArray{T}, distr::Distribution, rng::AbstractRNG) where {T}
-    return A + T.(rand(rng, distr, size(A)))
-end
-
 """
     augment_batch_dim(input, n)
 
@@ -80,13 +35,13 @@ function reduce_augmentation(input::AbstractArray{T,N}, n) where {T<:AbstractFloat,N}
     out = similar(input, eltype(input), out_size)
 
     axs = axes(input, N)
-    inds_before_N = ntuple(Returns(:), N - 1)
+    colons = ntuple(Returns(:), N - 1)
     for (i, ax) in enumerate(first(axs):n:last(axs))
-        view(out, inds_before_N..., i) .=
-            sum(view(input, inds_before_N..., ax:(ax + n - 1)); dims=N) / n
+        view(out, colons..., i) .= sum(view(input, colons..., ax:(ax + n - 1)); dims=N) / n
     end
     return out
 end
+
 """
     augment_indices(indices, n)
 
@@ -115,3 +70,117 @@ function augment_indices(inds::Vector{CartesianIndex{N}}, n) where {N}
         CartesianIndex{N}(idx..., i)
     end
 end
+
+"""
+    NoiseAugmentation(analyzer, n, [std=1, rng=GLOBAL_RNG])
+    NoiseAugmentation(analyzer, n, distribution, [rng=GLOBAL_RNG])
+
+A wrapper around analyzers that augments the input with `n` samples of additive noise sampled from `distribution`.
+This input augmentation is then averaged to return an `Explanation`.
+"""
+struct NoiseAugmentation{A<:AbstractXAIMethod,D<:Sampleable,R<:AbstractRNG} <:
+       AbstractXAIMethod
+    analyzer::A
+    n::Int
+    distribution::D
+    rng::R
+end
+function NoiseAugmentation(analyzer, n, distr::Sampleable, rng=GLOBAL_RNG)
+    return NoiseAugmentation(analyzer, n, distr::Sampleable, rng)
+end
+function NoiseAugmentation(analyzer, n, σ::Real=0.1f0, args...)
+    return NoiseAugmentation(analyzer, n, Normal(0.0f0, Float32(σ)^2), args...)
+end
+
+function (aug::NoiseAugmentation)(input, ns::AbstractNeuronSelector)
+    # Regular forward pass of model
+    output = aug.analyzer.model(input)
+    output_indices = ns(output)
+
+    # Call regular analyzer on augmented batch
+    augmented_input = add_noise(augment_batch_dim(input, aug.n), aug.distribution, aug.rng)
+    augmented_indices = augment_indices(output_indices, aug.n)
+    augmented_expl = aug.analyzer(augmented_input, AugmentationSelector(augmented_indices))
+
+    # Average explanation
+    return Explanation(
+        reduce_augmentation(augmented_expl.attribution, aug.n),
+        output,
+        output_indices,
+        augmented_expl.analyzer,
+        Nothing,
+    )
+end
+
+function add_noise(A::AbstractArray{T}, distr::Distribution, rng::AbstractRNG) where {T}
+    return A + T.(rand(rng, distr, size(A)))
+end
+
+"""
+    InterpolationAugmentation(model, [n=50])
+
+A wrapper around analyzers that augments the input with `n` steps of linear interpolation
+between the input and a reference input (typically `zero(input)`).
+The gradients w.r.t. this augmented input are then averaged and multiplied with the
+difference between the input and the reference input.
+"""
+struct InterpolationAugmentation{A<:AbstractXAIMethod} <: AbstractXAIMethod
+    analyzer::A
+    n::Int
+end
+
+function (aug::InterpolationAugmentation)(
+    input, ns::AbstractNeuronSelector, input_ref=zero(input)
+)
+    size(input) != size(input_ref) &&
+        throw(ArgumentError("Input reference size doesn't match input size."))
+
+    # Regular forward pass of model
+    output = aug.analyzer.model(input)
+    output_indices = ns(output)
+
+    # Call regular analyzer on augmented batch
+    augmented_input = interpolate_batch(input, input_ref, aug.n)
+    augmented_indices = augment_indices(output_indices, aug.n)
+    augmented_expl = aug.analyzer(augmented_input, AugmentationSelector(augmented_indices))
+
+    # Average gradients and compute explanation
+    expl = (input - input_ref) .* reduce_augmentation(augmented_expl.attribution, aug.n)
+
+    return Explanation(expl, output, output_indices, augmented_expl.analyzer, Nothing)
+end
+
+"""
+    interpolate_batch(x, x0, nsamples)
+
+Augment batch along batch dimension using linear interpolation between input `x` and a reference input `x0`.
+
+## Example
+```julia-repl
+julia> x = Float16.(reshape(1:4, 2, 2))
+2×2 Matrix{Float16}:
+ 1.0  3.0
+ 2.0  4.0
+
+julia> x0 = zero(x)
+2×2 Matrix{Float16}:
+ 0.0  0.0
+ 0.0  0.0
+
+julia> interpolate_batch(x, x0, 5)
+2×10 Matrix{Float16}:
+ 0.0  0.25  0.5  0.75  1.0  0.0  0.75  1.5  2.25  3.0
+ 0.0  0.5   1.0  1.5   2.0  0.0  1.0   2.0  3.0   4.0
+```
+"""
+function interpolate_batch(
+    x::AbstractArray{T,N}, x0::AbstractArray{T,N}, nsamples
+) where {T,N}
+    in_size = size(x)
+    outs = similar(x, (in_size[1:(end - 1)]..., in_size[end] * nsamples))
+    colons = ntuple(Returns(:), N - 1)
+    for (i, t) in enumerate(range(zero(T), oneunit(T); length=nsamples))
+        outs[colons..., i:nsamples:end] .= x0 + t * (x - x0)
+    end
+    return outs
+end
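
To make the shared helper behaviour concrete, here is a small sketch of `reduce_augmentation`, the internal (unexported) helper both wrappers use to average the augmented explanations. The expected values follow from the loop shown above, under the assumption of a 4-D WHCN batch; this is an illustration, not part of the commit.

```julia
using ExplainableAI

# A WHCN "batch" of six scalar samples: 1, 2, 3, 4, 5, 6
x = reshape(Float32.(1:6), 1, 1, 1, 6)

# Average every block of n = 3 consecutive batch slices back down to one slice:
# mean(1, 2, 3) = 2 and mean(4, 5, 6) = 5
reduced = ExplainableAI.reduce_augmentation(x, 3)

size(reduced)  # (1, 1, 1, 2)
vec(reduced)   # Float32[2.0, 5.0]
```
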
(589 KB binary file not shown)
