diff --git a/src/convnets/convmixer.jl b/src/convnets/convmixer.jl
index 70827d0aa..01a6e61be 100644
--- a/src/convnets/convmixer.jl
+++ b/src/convnets/convmixer.jl
@@ -17,11 +17,11 @@ Creates a ConvMixer model.
 function convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9),
                    patch_size::Dims{2} = (7, 7), activation = gelu, nclasses = 1000)
   stem = conv_bn(patch_size, inchannels, planes, activation; preact = true, stride = patch_size[1])
-  blocks = [Chain(SkipConnection(conv_bn(kernel_size, planes, planes, activation;
-                                               preact = true, groups = planes, pad = SamePad()), +),
-                  conv_bn((1, 1), planes, planes, activation; preact = true)) for _ in 1:depth]
+  blocks = [Chain(SkipConnection(Chain(conv_bn(kernel_size, planes, planes, activation;
+                                               preact = true, groups = planes, pad = SamePad())...), +),
+                  conv_bn((1, 1), planes, planes, activation; preact = true)...) for _ in 1:depth]
   head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses))
-  return Chain(Chain(stem, Chain(blocks)), head)
+  return Chain(Chain(stem..., Chain(blocks)), head)
 end
 
 convmixer_config = Dict(:base => Dict(:planes => 1536, :depth => 20, :kernel_size => (9, 9),
diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl
index eff19f1a8..53d96df09 100644
--- a/src/convnets/densenet.jl
+++ b/src/convnets/densenet.jl
@@ -11,8 +11,8 @@ Create a Densenet bottleneck layer
 """
 function dense_bottleneck(inplanes, outplanes)
   inner_channels = 4 * outplanes
-  m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true),
-            conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, rev = true))
+  m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true)...,
+            conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, rev = true)...)
 
   SkipConnection(m, cat_channels)
 end
@@ -28,7 +28,7 @@ Create a DenseNet transition sequence
 - `outplanes`: number of output feature maps
 """
 transition(inplanes, outplanes) =
-  Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true), MeanPool((2, 2)))
+  Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true)..., MeanPool((2, 2)))
 
 """
     dense_block(inplanes, growth_rates)
@@ -60,7 +60,7 @@ Create a DenseNet model
 """
 function densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000)
   layers = []
-  push!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), bias = false))
+  append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), bias = false))
   push!(layers, MaxPool((3, 3), stride = 2, pad = (1, 1)))
 
   outplanes = 0
diff --git a/src/convnets/inception.jl b/src/convnets/inception.jl
index 00bdd0ccb..ef8ab81ef 100644
--- a/src/convnets/inception.jl
+++ b/src/convnets/inception.jl
@@ -9,17 +9,17 @@ Create an Inception-v3 style-A module
 - `pool_proj`: the number of output feature maps for the pooling projection
 """
 function inception_a(inplanes, pool_proj)
-  branch1x1 = conv_bn((1, 1), inplanes, 64)
+  branch1x1 = Chain(conv_bn((1, 1), inplanes, 64)...)
 
-  branch5x5 = Chain(conv_bn((1, 1), inplanes, 48),
-                    conv_bn((5, 5), 48, 64; pad = 2))
+  branch5x5 = Chain(conv_bn((1, 1), inplanes, 48)...,
+                    conv_bn((5, 5), 48, 64; pad = 2)...)
 
-  branch3x3 = Chain(conv_bn((1, 1), inplanes, 64),
-                    conv_bn((3, 3), 64, 96; pad = 1),
-                    conv_bn((3, 3), 96, 96; pad = 1))
+  branch3x3 = Chain(conv_bn((1, 1), inplanes, 64)...,
+                    conv_bn((3, 3), 64, 96; pad = 1)...,
+                    conv_bn((3, 3), 96, 96; pad = 1)...)
 
   branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1),
-                      conv_bn((1, 1), inplanes, pool_proj))
+                      conv_bn((1, 1), inplanes, pool_proj)...)
 
   return Parallel(cat_channels,
                   branch1x1, branch5x5, branch3x3, branch_pool)
@@ -35,11 +35,11 @@ Create an Inception-v3 style-B module
 - `inplanes`: number of input feature maps
 """
 function inception_b(inplanes)
-  branch3x3_1 = conv_bn((3, 3), inplanes, 384; stride = 2)
+  branch3x3_1 = Chain(conv_bn((3, 3), inplanes, 384; stride = 2)...)
 
-  branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64),
-                      conv_bn((3, 3), 64, 96; pad = 1),
-                      conv_bn((3, 3), 96, 96; stride = 2))
+  branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64)...,
+                      conv_bn((3, 3), 64, 96; pad = 1)...,
+                      conv_bn((3, 3), 96, 96; stride = 2)...)
 
   branch_pool = MaxPool((3, 3), stride = 2)
 
@@ -59,20 +59,20 @@ Create an Inception-v3 style-C module
 - `n`: the "grid size" (kernel size) for the convolution layers
 """
 function inception_c(inplanes, inner_planes, n = 7)
-  branch1x1 = conv_bn((1, 1), inplanes, 192)
+  branch1x1 = Chain(conv_bn((1, 1), inplanes, 192)...)
 
-  branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes),
-                      conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3)),
-                      conv_bn((n, 1), inner_planes, 192; pad = (3, 0)))
+  branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes)...,
+                      conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))...,
+                      conv_bn((n, 1), inner_planes, 192; pad = (3, 0))...)
 
-  branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes),
-                      conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0)),
-                      conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3)),
-                      conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0)),
-                      conv_bn((1, n), inner_planes, 192; pad = (0, 3)))
+  branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes)...,
+                      conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))...,
+                      conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))...,
+                      conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))...,
+                      conv_bn((1, n), inner_planes, 192; pad = (0, 3))...)
 
   branch_pool = Chain(MeanPool((3, 3), pad = 1, stride=1),
-                      conv_bn((1, 1), inplanes, 192))
+                      conv_bn((1, 1), inplanes, 192)...)
 
   return Parallel(cat_channels,
                   branch1x1, branch7x7_1, branch7x7_2, branch_pool)
@@ -88,13 +88,13 @@ Create an Inception-v3 style-D module
 - `inplanes`: number of input feature maps
 """
 function inception_d(inplanes)
-  branch3x3 = Chain(conv_bn((1, 1), inplanes, 192),
-                    conv_bn((3, 3), 192, 320; stride = 2))
+  branch3x3 = Chain(conv_bn((1, 1), inplanes, 192)...,
+                    conv_bn((3, 3), 192, 320; stride = 2)...)
 
-  branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192),
-                      conv_bn((1, 7), 192, 192; pad = (0, 3)),
-                      conv_bn((7, 1), 192, 192; pad = (3, 0)),
-                      conv_bn((3, 3), 192, 192; stride = 2))
+  branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192)...,
+                      conv_bn((1, 7), 192, 192; pad = (0, 3))...,
+                      conv_bn((7, 1), 192, 192; pad = (3, 0))...,
+                      conv_bn((3, 3), 192, 192; stride = 2)...)
 
   branch_pool = MaxPool((3, 3), stride=2)
 
@@ -112,19 +112,19 @@ Create an Inception-v3 style-E module
 - `inplanes`: number of input feature maps
 """
 function inception_e(inplanes)
-  branch1x1 = conv_bn((1, 1), inplanes, 320)
+  branch1x1 = Chain(conv_bn((1, 1), inplanes, 320)...)
 
-  branch3x3_1 = conv_bn((1, 1), inplanes, 384)
-  branch3x3_1a = conv_bn((1, 3), 384, 384; pad = (0, 1))
-  branch3x3_1b = conv_bn((3, 1), 384, 384; pad = (1, 0))
+  branch3x3_1 = Chain(conv_bn((1, 1), inplanes, 384)...)
+  branch3x3_1a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))...)
+  branch3x3_1b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))...)
 
-  branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448),
-                      conv_bn((3, 3), 448, 384; pad = 1))
-  branch3x3_2a = conv_bn((1, 3), 384, 384; pad = (0, 1))
-  branch3x3_2b = conv_bn((3, 1), 384, 384; pad = (1, 0))
+  branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448)...,
+                      conv_bn((3, 3), 448, 384; pad = 1)...)
+  branch3x3_2a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))...)
+  branch3x3_2b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))...)
 
   branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1),
-                      conv_bn((1, 1), inplanes, 192))
+                      conv_bn((1, 1), inplanes, 192)...)
 
   return Parallel(cat_channels,
                   branch1x1,
@@ -150,12 +150,12 @@ Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)).
     `inception3` does not currently support pretrained weights.
 """
 function inception3(; nclasses = 1000)
-  layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2),
-                      conv_bn((3, 3), 32, 32),
-                      conv_bn((3, 3), 32, 64; pad = 1),
+  layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2)...,
+                      conv_bn((3, 3), 32, 32)...,
+                      conv_bn((3, 3), 32, 64; pad = 1)...,
                       MaxPool((3, 3), stride = 2),
-                      conv_bn((1, 1), 64, 80),
-                      conv_bn((3, 3), 80, 192),
+                      conv_bn((1, 1), 64, 80)...,
+                      conv_bn((3, 3), 80, 192)...,
                       MaxPool((3, 3), stride = 2),
                       inception_a(192, 32),
                       inception_a(256, 64),
diff --git a/src/convnets/mobilenet.jl b/src/convnets/mobilenet.jl
index 186726ef9..1d0a6227e 100644
--- a/src/convnets/mobilenet.jl
+++ b/src/convnets/mobilenet.jl
@@ -34,7 +34,7 @@ function mobilenetv1(width_mult, config;
       layer = dw ? depthwise_sep_conv_bn((3, 3), inchannels, outch, activation;
                                          stride = stride, pad = 1) :
                    conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1)
-      push!(layers, layer)
+      append!(layers, layer)
       inchannels = outch
     end
   end
@@ -118,7 +118,7 @@ function mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000)
   # building first layer
   inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8)
   layers = []
-  push!(layers, conv_bn((3, 3), 3, inplanes, stride = 2))
+  append!(layers, conv_bn((3, 3), 3, inplanes, stride = 2))
 
   # building inverted residual blocks
   for (t, c, n, s, a) in configs
@@ -134,7 +134,7 @@ function mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000)
   outplanes = (width_mult > 1) ? _round_channels(max_width * width_mult, width_mult == 0.1 ? 4 : 8) :
                                  max_width
 
-  return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, outplanes, relu6, bias = false)),
+  return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, outplanes, relu6, bias = false)...),
                Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(outplanes, nclasses)))
 end
 
@@ -211,7 +211,7 @@ function mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000)
   # building first layer
   inplanes = _round_channels(16 * width_mult, 8)
   layers = []
-  push!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2))
+  append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2))
   explanes = 0
   # building inverted residual blocks
   for (k, t, c, r, a, s) in configs
@@ -230,7 +230,7 @@ function mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000)
                      Dropout(0.2),
                      Dense(output_channel, nclasses))
 
-  return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, explanes, hardswish, bias = false)),
+  return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, explanes, hardswish, bias = false)...),
                Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier))
 end
 
diff --git a/src/convnets/resnet.jl b/src/convnets/resnet.jl
index 5de0e35dd..72d2de3fb 100644
--- a/src/convnets/resnet.jl
+++ b/src/convnets/resnet.jl
@@ -12,8 +12,8 @@ Create a basic residual block
 """
 function basicblock(inplanes, outplanes, downsample = false)
   stride = downsample ? 2 : 1
-  Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, bias = false),
-        conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, bias = false))
+  Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, bias = false)...,
+        conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, bias = false)...)
 end
 
 """
@@ -36,12 +36,11 @@ The original paper uses `stride == [2, 1, 1]` when `downsample == true` instead.
 """
 function bottleneck(inplanes, outplanes, downsample = false;
                     stride = [1, (downsample ? 2 : 1), 1])
-  Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], bias = false),
-        conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, bias = false),
-        conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], bias = false))
+  Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], bias = false)...,
+        conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, bias = false)...,
+        conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], bias = false)...)
 end
 
-
 """
     bottleneck_v1(inplanes, outplanes, downsample = false)
 
@@ -82,7 +81,7 @@ function resnet(block, residuals::AbstractVector{<:NTuple{2, Any}}, connection =
   inplanes = 64
   baseplanes = 64
   layers = []
-  push!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false))
+  append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false))
   push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1)))
   for (i, nrepeats) in enumerate(block_config)
     # output planes within a block
diff --git a/src/convnets/resnext.jl b/src/convnets/resnext.jl
index c9d7aa669..53ff60c95 100644
--- a/src/convnets/resnext.jl
+++ b/src/convnets/resnext.jl
@@ -14,10 +14,10 @@ Create a basic residual block as defined in the paper for ResNeXt
 function resnextblock(inplanes, outplanes, cardinality, width, downsample = false)
   stride = downsample ? 2 : 1
   hidden_channels = cardinality * width
-  return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false),
+  return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false)...,
                conv_bn((3, 3), hidden_channels, hidden_channels;
-                        stride = stride, pad = 1, bias = false, groups = cardinality),
-               conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false))
+                        stride = stride, pad = 1, bias = false, groups = cardinality)...,
+               conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false)...)
 end
 
 """
@@ -40,7 +40,7 @@ function resnext(cardinality, width, widen_factor = 2, connection = (x, y) -> @.
   inplanes = 64
   baseplanes = 128
   layers = []
-  push!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3)))
+  append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3)))
   push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1)))
   for (i, nrepeats) in enumerate(block_config)
     # output planes within a block
diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl
index 6cc9dab83..3cfb2dc12 100644
--- a/src/convnets/vgg.jl
+++ b/src/convnets/vgg.jl
@@ -16,7 +16,7 @@ function vgg_block(ifilters, ofilters, depth, batchnorm)
   layers = []
   for _ in 1:depth
     if batchnorm
-      push!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false))
+      append!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false))
     else
       push!(layers, Conv(k, ifilters => ofilters, relu, pad = p))
     end
diff --git a/src/layers/conv.jl b/src/layers/conv.jl
index 78b729c01..ca30df8a4 100644
--- a/src/layers/conv.jl
+++ b/src/layers/conv.jl
@@ -45,7 +45,7 @@ function conv_bn(kernelsize, inplanes, outplanes, activation = relu;
   push!(layers, BatchNorm(Int(bnplanes), activations.bn;
                           initβ = initβ, initγ = initγ, ϵ = ϵ, momentum = momentum))
 
-  return rev ? Chain(reverse(layers)) : Chain(layers)
+  return rev ? reverse(layers) : layers # layer list, not a Chain: callers splat or wrap it
 end
 
 """
@@ -82,13 +82,13 @@ depthwise_sep_conv_bn(kernelsize, inplanes, outplanes, activation = relu;
                       initβ = Flux.zeros32, initγ = Flux.ones32,
                       ϵ = 1f-5, momentum = 1f-1,
                       stride = 1, kwargs...) =
-  Chain(vcat(conv_bn(kernelsize, inplanes, inplanes, activation;
-                     rev = rev, initβ = initβ, initγ = initγ,
-                     ϵ = ϵ, momentum = momentum,
-                     stride = stride, groups = Int(inplanes), kwargs...),
-             conv_bn((1, 1), inplanes, outplanes, activation;
-                     rev = rev, initβ = initβ, initγ = initγ,
-                     ϵ = ϵ, momentum = momentum)))
+  vcat(conv_bn(kernelsize, inplanes, inplanes, activation;
+               rev = rev, initβ = initβ, initγ = initγ,
+               ϵ = ϵ, momentum = momentum,
+               stride = stride, groups = Int(inplanes), kwargs...),
+       conv_bn((1, 1), inplanes, outplanes, activation;
+               rev = rev, initβ = initβ, initγ = initγ,
+               ϵ = ϵ, momentum = momentum))
 
 """
     skip_projection(inplanes, outplanes, downsample = false)
@@ -102,8 +102,8 @@ Create a skip projection
 - `downsample`: set to `true` to downsample the input
 """
 skip_projection(inplanes, outplanes, downsample = false) = downsample ?
-  conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false) :
-  conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)
+  Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false)...) :
+  Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)...)
 
 # array -> PaddedView(0, array, outplanes) for zero padding arrays
 """
@@ -144,8 +144,8 @@ Squeeze and excitation layer used by MobileNet variants
 function squeeze_excite(channels, reduction = 4)
   @assert (reduction >= 1) "`reduction` must be >= 1"
   SkipConnection(Chain(AdaptiveMeanPool((1, 1)),
-                       conv_bn((1, 1), channels, channels ÷ reduction, relu; bias = false),
-                       conv_bn((1, 1), channels ÷ reduction, channels, hardσ)), .*)
+                       conv_bn((1, 1), channels, channels ÷ reduction, relu; bias = false)...,
+                       conv_bn((1, 1), channels ÷ reduction, channels, hardσ)...), .*)
 end
 
 """
@@ -171,14 +171,14 @@ function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, activ
   @assert stride in [1, 2] "`stride` has to be 1 or 2"
 
   pad = @. (kernel_size - 1) ÷ 2
-  conv1 = (inplanes == hidden_planes) ? identity : conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)
+  conv1 = (inplanes == hidden_planes) ? identity : Chain(conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)...)
   selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes, reduction)
 
   invres = Chain(conv1,
                  conv_bn(kernel_size, hidden_planes, hidden_planes, activation;
-                         bias = false, stride, pad = pad, groups = hidden_planes),
+                         bias = false, stride, pad = pad, groups = hidden_planes)...,
                  selayer,
-                 conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false))
+                 conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false)...)
 
   (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres
 end