
Commit e7346fd

Add model canonization (#51)
* Add network canonization
* Move types to new file
* Refactor `strip_softmax`
1 parent f47d3dc commit e7346fd
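
For orientation, here is a minimal usage sketch of the new `canonize` export (not part of the commit; the model and layer sizes are purely illustrative):

using Flux
using ExplainableAI

# A toy chain where BatchNorm follows a linear (identity-activation) Dense layer.
model = Chain(Dense(10, 5), BatchNorm(5, relu), Dense(5, 2), softmax)

# canonize flattens the chain and fuses the BatchNorm into the preceding Dense layer,
# leaving a shorter model that computes the same function at inference time.
model_canonized = canonize(model)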

File tree: 8 files changed, +212 −27 lines

docs/src/api.md

Lines changed: 2 additions & 1 deletion
@@ -14,7 +14,7 @@ SmoothGrad
 ```

 `SmoothGrad` is a special case of `InputAugmentation`, which can be applied as a wrapper to any analyzer:
-```@doc
+```@docs
 InputAugmentation
 ```

@@ -41,6 +41,7 @@ LRP_CONFIG.supports_activation
 ```@docs
 strip_softmax
 flatten_model
+canonize
 ```

 # Index

src/ExplainableAI.jl

Lines changed: 3 additions & 1 deletion
@@ -18,8 +18,10 @@ using PrettyTables

 include("neuron_selection.jl")
 include("analyze_api.jl")
+include("types.jl")
 include("flux.jl")
 include("utils.jl")
+include("canonize.jl")
 include("input_augmentation.jl")
 include("gradient.jl")
 include("lrp_checks.jl")
@@ -46,6 +48,6 @@ export check_model
 export heatmap

 # utils
-export strip_softmax, flatten_model, flatten_chain
+export strip_softmax, flatten_model, flatten_chain, canonize

 end # module

src/canonize.jl

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+function fuse_batchnorm(d::Dense, bn::BatchNorm)
+    d.σ != identity &&
+        throw(ArgumentError("Can't fuse Dense layer with activation $(d.σ)."))
+    scale = safedivide(bn.γ, sqrt.(bn.σ²))
+    W = scale .* d.weight
+    b = scale .* (d.bias - bn.μ) + bn.β
+    return Dense(W, b, bn.λ)
+end
+
+function fuse_batchnorm(c::Conv, bn::BatchNorm)
+    c.σ != identity && throw(ArgumentError("Can't fuse Conv layer with activation $(c.σ)."))
+    scale = safedivide(bn.γ, sqrt.(bn.σ²))
+    W = c.weight .* reshape(scale, 1, 1, 1, :)
+    b = scale .* (c.bias - bn.μ) + bn.β
+    return Conv(W, b, bn.λ)
+end
+
+"""
+    try_fusing(model, i)
+
+Attempt to fuse pair of model layers at indices `i` and `i+1`.
+Returns fused model and `true` if layers were fused, unmodified model and `false` otherwise.
+"""
+function try_fusing(model, i)
+    l1 = model[i]
+    l2 = model[i + 1]
+    if l1 isa Union{Dense,Conv} && l2 isa BatchNorm && activation(l1) == identity
+        if i == length(model) - 1
+            model = Chain(model[1:(i - 1)]..., fuse_batchnorm(l1, l2))
+        end
+        model = Chain(model[1:(i - 1)]..., fuse_batchnorm(l1, l2), model[(i + 2):end]...)
+        return model, true
+    end
+    return model, false
+end
+
+"""
+    canonize(model)
+
+Canonize model by flattening it and fusing BatchNorm layers into preceding Dense and Conv
+layers with linear activation functions.
+"""
+function canonize(model::Chain)
+    model = flatten_model(model)
+    i = 1
+    while i < length(model)
+        model, fused = try_fusing(model, i)
+        !fused && (i += 1)
+    end
+    return model
+end
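
For readers unfamiliar with BatchNorm folding: at inference time a BatchNorm layer is an affine map per channel, so it can be absorbed into a preceding linear layer. Below is a standalone sketch of the identity that `fuse_batchnorm` exploits (not part of the commit; it spells out the ε term that the package's `safedivide` helper otherwise takes care of, and the layer sizes are hypothetical):

using Flux

# Hypothetical sizes; the Dense layer has identity activation, as fuse_batchnorm requires.
d = Dense(randn(Float32, 3, 4), randn(Float32, 3))
bn = BatchNorm(3)

# Collect running statistics so that μ and σ² are non-trivial.
Flux.trainmode!(bn)
bn(randn(Float32, 3, 32))
Flux.testmode!(bn)

# At test time BatchNorm computes γ .* (y .- μ) ./ sqrt.(σ² .+ ϵ) .+ β per channel,
# which folds into the preceding layer's weight and bias:
scale = bn.γ ./ sqrt.(bn.σ² .+ bn.ϵ)
fused = Dense(scale .* d.weight, scale .* (d.bias .- bn.μ) .+ bn.β, bn.λ)

x = randn(Float32, 4, 8)
@assert fused(x) ≈ bn(d(x))  # the fused layer reproduces Dense followed by BatchNorm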

src/flux.jl

Lines changed: 22 additions & 22 deletions
@@ -1,21 +1,14 @@
-## Group layers by type:
-const ConvLayer = Union{Conv} # TODO: DepthwiseConv, ConvTranspose, CrossCor
-const DropoutLayer = Union{Dropout,typeof(Flux.dropout),AlphaDropout}
-const ReshapingLayer = Union{typeof(Flux.flatten)}
-# Pooling layers
-const MaxPoolLayer = Union{MaxPool,AdaptiveMaxPool,GlobalMaxPool}
-const MeanPoolLayer = Union{MeanPool,AdaptiveMeanPool,GlobalMeanPool}
-const PoolingLayer = Union{MaxPoolLayer,MeanPoolLayer}
-# Activation functions that are similar to ReLU
-const ReluLikeActivation = Union{
-    typeof(relu),typeof(gelu),typeof(swish),typeof(softplus),typeof(mish)
-}
-# Layers & activation functions supported by LRP
-const LRPSupportedLayer = Union{Dense,ConvLayer,DropoutLayer,ReshapingLayer,PoolingLayer}
-const LRPSupportedActivation = Union{typeof(identity),ReluLikeActivation}
+"""
+    activation(layer)
+
+Return activation function of the layer.
+In case the layer is unknown or no activation function is found, `nothing` is returned.
+"""
+activation(l::Dense) = l.σ
+activation(l::Conv) = l.σ
+activation(l::BatchNorm) = l.λ
+activation(layer) = nothing # default for all other layer types

-_flatten_model(x) = x
-_flatten_model(c::Chain) = [c.layers...]
 """
     flatten_model(c)

@@ -30,8 +23,11 @@ function flatten_model(chain::Chain)
 end
 @deprecate flatten_chain(c) flatten_model(c)

-is_softmax(layer) = layer isa Union{typeof(softmax),typeof(softmax!)}
-has_output_softmax(x) = is_softmax(x)
+_flatten_model(x) = x
+_flatten_model(c::Chain) = [c.layers...]
+
+is_softmax(x) = x isa SoftmaxActivation
+has_output_softmax(x) = is_softmax(x) || is_softmax(activation(x))
 has_output_softmax(model::Chain) = has_output_softmax(model[end])

 """
@@ -56,10 +52,14 @@ Remove softmax activation on model output if it exists.
 function strip_softmax(model::Chain)
     if has_output_softmax(model)
         model = flatten_model(model)
-        return Chain(model.layers[1:(end - 1)]...)
+        if is_softmax(model[end])
+            return Chain(model.layers[1:(end - 1)]...)
+        end
+        return Chain(model.layers[1:(end - 1)]..., strip_softmax(model[end]))
     end
     return model
 end
+strip_softmax(l::Union{Dense,Conv}) = set_params(l, l.weight, l.bias, identity)

 # helper function to work around Flux.Zeros
 function get_params(layer)
@@ -76,5 +76,5 @@ end

 Duplicate layer using weights W, b.
 """
-set_params(l::Conv, W, b) = Conv(l.σ, W, b, l.stride, l.pad, l.dilation, l.groups)
-set_params(l::Dense, W, b) = Dense(W, b, l.σ)
+set_params(l::Conv, W, b, σ=l.σ) = Conv(σ, W, b, l.stride, l.pad, l.dilation, l.groups)
+set_params(l::Dense, W, b, σ=l.σ) = Dense(W, b, σ)
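
A short sketch of what this refactor enables (illustrative layer sizes, not taken from the commit): `strip_softmax` now also removes a softmax that is set as the final layer's activation, by rebuilding that layer with `identity` via `set_params`:

using Flux
using ExplainableAI
using ExplainableAI: activation   # non-exported helper added in this commit

m1 = Chain(Dense(4, 3, relu), Dense(3, 2), softmax)    # softmax as a separate output layer
m2 = Chain(Dense(4, 3, relu), Dense(3, 2, softmax))    # softmax as the last layer's activation

strip_softmax(m1)     # drops the trailing softmax layer
strip_softmax(m2)     # rebuilds the last Dense with identity activation
activation(m2[end])   # softmax; returns nothing for unknown layer types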

src/types.jl

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+## Layer types
+"""Union type for convolutional layers."""
+const ConvLayer = Union{Conv} # TODO: DepthwiseConv, ConvTranspose, CrossCor
+
+"""Union type for dropout layers."""
+const DropoutLayer = Union{Dropout,typeof(Flux.dropout),AlphaDropout}
+
+"""Union type for reshaping layers such as `flatten`."""
+const ReshapingLayer = Union{typeof(Flux.flatten)}
+
+"""Union type for max pooling layers."""
+const MaxPoolLayer = Union{MaxPool,AdaptiveMaxPool,GlobalMaxPool}
+
+"""Union type for mean pooling layers."""
+const MeanPoolLayer = Union{MeanPool,AdaptiveMeanPool,GlobalMeanPool}
+
+"""Union type for pooling layers."""
+const PoolingLayer = Union{MaxPoolLayer,MeanPoolLayer}
+
+# Activation functions
+"""Union type for ReLU-like activation functions."""
+const ReluLikeActivation = Union{
+    typeof(relu),typeof(gelu),typeof(swish),typeof(softplus),typeof(mish)
+}
+
+"""Union type for softmax activation functions."""
+const SoftmaxActivation = Union{typeof(softmax),typeof(softmax!)}
+
+# Layers & activation functions supported by LRP
+"""Union type for layers that are allowed by default in "deep rectifier networks"."""
+const LRPSupportedLayer = Union{Dense,ConvLayer,DropoutLayer,ReshapingLayer,PoolingLayer}
+
+"""Union type for activation functions that are allowed by default in "deep rectifier networks"."""
+const LRPSupportedActivation = Union{typeof(identity),ReluLikeActivation}
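
As an illustration (not part of the commit), these aliases double as predicates for plain `isa` checks and as dispatch targets:

using Flux
using ExplainableAI: LRPSupportedLayer, LRPSupportedActivation, SoftmaxActivation

Dense(4, 2) isa LRPSupportedLayer        # true
MaxPool((2, 2)) isa LRPSupportedLayer    # true
relu isa LRPSupportedActivation          # true, ReLU-like activations are included
softmax isa SoftmaxActivation            # true, used by the new is_softmax check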

test/runtests.jl

Lines changed: 4 additions & 0 deletions
@@ -19,6 +19,10 @@ using ReferenceTests
     println("Running tests on heatmaps...")
     include("test_heatmaps.jl")
 end
+@testset "Canonize" begin
+    println("Running tests on model canonization...")
+    include("test_canonize.jl")
+end
 @testset "LRP model checks" begin
     println("Running tests on LRP model checks...")
     include("test_checks.jl")

test/test_canonize.jl

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+using Flux
+using ExplainableAI
+using ExplainableAI: fuse_batchnorm
+using Random
+
+pseudorand(dims...) = rand(MersenneTwister(123), Float32, dims...)
+batchsize = 50
+
+# # Test `fuse_batchnorm` on Dense layer
+ins = 20
+outs = 10
+dense = Dense(ins, outs; init=pseudorand)
+bn_dense = BatchNorm(outs, relu; initβ=pseudorand, initγ=pseudorand)
+model = Chain(dense, bn_dense)
+
+# collect statistics
+x = pseudorand(ins, batchsize)
+Flux.trainmode!(model)
+model(x)
+Flux.testmode!(model)
+
+dense_fused = @inferred fuse_batchnorm(dense, bn_dense)
+@test dense_fused(x) ≈ model(x)
+
+# # Test `fuse_batchnorm` on Conv layer
+insize = (10, 10, 3)
+conv = Conv((3, 3), 3 => 4; init=pseudorand)
+bn_conv = BatchNorm(4, relu; initβ=pseudorand, initγ=pseudorand)
+model = Chain(conv, bn_conv)
+
+# collect statistics
+x = pseudorand(insize..., batchsize)
+Flux.trainmode!(model)
+model(x)
+Flux.testmode!(model)
+
+conv_fused = @inferred fuse_batchnorm(conv, bn_conv)
+@test conv_fused(x) ≈ model(x)
+
+# # Test `canonize` on models
+# Sequential BatchNorm layers should be fused until they create a Dense or Conv layer
+# with non-linear activation function.
+model = Chain(
+    Conv((3, 3), 3 => 6),
+    BatchNorm(6),
+    Conv((3, 3), 6 => 2, identity),
+    BatchNorm(2),
+    BatchNorm(2, softplus),
+    BatchNorm(2),
+    flatten,
+    Dense(72, 10),
+    BatchNorm(10),
+    BatchNorm(10),
+    BatchNorm(10, relu),
+    BatchNorm(10),
+    Dense(10, 10, gelu),
+    BatchNorm(10),
+    softmax,
+)
+Flux.trainmode!(model)
+model(x)
+Flux.testmode!(model)
+model_canonized = canonize(model)
+
+# 6 of the BatchNorm layers should be removed and the outputs should match
+@test length(model_canonized) == length(model) - 6
+@test model(x) ≈ model_canonized(x)

test/test_utils.jl

Lines changed: 29 additions & 3 deletions
@@ -1,6 +1,15 @@
 using Flux
-using ExplainableAI: flatten_model, has_output_softmax, check_output_softmax
+using ExplainableAI: flatten_model, has_output_softmax, check_output_softmax, activation
 using ExplainableAI: stabilize_denom, batch_dim_view, drop_batch_index
+using Random
+
+pseudorand(dims...) = rand(MersenneTwister(123), Float32, dims...)
+
+# Test `activation`
+@test activation(Dense(5, 2, gelu)) == gelu
+@test activation(Conv((5, 5), 3 => 2, softplus)) == softplus
+@test activation(BatchNorm(5, selu)) == selu
+@test isnothing(activation(flatten))

 # flatten_model
 @test flatten_model(Chain(Chain(Chain(abs)), sqrt, Chain(relu))) == Chain(abs, sqrt, relu)
@@ -12,14 +21,31 @@ using ExplainableAI: stabilize_denom, batch_dim_view, drop_batch_index
 @test has_output_softmax(Chain(abs, sqrt, relu, tanh)) == false
 @test has_output_softmax(Chain(Chain(abs), sqrt, Chain(Chain(softmax)))) == true
 @test has_output_softmax(Chain(Chain(abs), Chain(Chain(softmax)), sqrt)) == false
+@test has_output_softmax(Chain(Dense(5, 5, softmax), Dense(5, 5, softmax))) == true
+@test has_output_softmax(Chain(Dense(5, 5, softmax), Dense(5, 5, relu))) == false
+@test has_output_softmax(Chain(Dense(5, 5, softmax), Chain(Dense(5, 5, softmax)))) == true
+@test has_output_softmax(Chain(Dense(5, 5, softmax), Chain(Dense(5, 5, relu)))) == false

 # check_output_softmax
 @test_throws ArgumentError check_output_softmax(Chain(abs, sqrt, relu, softmax))

 # strip_softmax
-@test strip_softmax(Chain(Chain(abs), sqrt, Chain(Chain(softmax)))) == Chain(abs, sqrt) # flatten to remove softmax
+d_softmax = Dense(2, 2, softmax; init=pseudorand)
+d_softmax2 = Dense(2, 2, softmax; init=pseudorand)
+d_relu = Dense(2, 2, relu; init=pseudorand)
+d_identity = Dense(2, 2; init=pseudorand)
+# flatten to remove softmax
+m = strip_softmax(Chain(Chain(abs), sqrt, Chain(Chain(softmax))))
+@test m == Chain(abs, sqrt)
+m1 = strip_softmax(Chain(d_relu, Chain(d_softmax)))
+m2 = Chain(d_relu, d_identity)
+x = rand(Float32, 2, 10)
+@test typeof(m1) == typeof(m2)
+@test m1(x) == m2(x)
+# don't do anything if there is no softmax at the end
 @test strip_softmax(Chain(Chain(abs), Chain(Chain(softmax)), sqrt)) ==
-    Chain(Chain(abs), Chain(Chain(softmax)), sqrt) # don't do anything if there is no softmax at the end
+    Chain(Chain(abs), Chain(Chain(softmax)), sqrt)
+@test strip_softmax(Chain(d_softmax, Chain(d_relu))) == Chain(d_softmax, Chain(d_relu))

 # stabilize_denom
 A = [1.0 0.0 1.0e-25; -1.0 -0.0 -1.0e-25]
