Add tests for inference and remove timeroutputs

Avik Pal · Avik Pal · commit 5449ebca57ef · 2019-06-13T16:52:35.000+05:30
diff --git a/src/impl/conv_direct.jl b/src/impl/conv_direct.jl
@@ -44,7 +44,7 @@ wrapper methods are available.
 """
 conv_direct!
 
-@timeit_debug to function conv_direct!(y::AbstractArray{yT,5}, x::AbstractArray{xT,5},
+function conv_direct!(y::AbstractArray{yT,5}, x::AbstractArray{xT,5},
                       w::AbstractArray{wT,5}, cdims::DenseConvDims;
                       alpha::yT = yT(1), beta = false) where {yT, xT, wT}
     check_dims(size(x), size(w), size(y), cdims)
@@ -114,7 +114,7 @@ Calculate the gradient imposed upon `x` in the convolution `y = x * w`.
 """
 ∇conv_data_direct!
 
-@timeit_debug to function ∇conv_data_direct!(dx::AbstractArray{xT,5}, dy::AbstractArray{yT,5},
+function ∇conv_data_direct!(dx::AbstractArray{xT,5}, dy::AbstractArray{yT,5},
                             w::AbstractArray{wT,5}, cdims::DenseConvDims;
                             alpha::xT=xT(1), beta=false) where {xT, yT, wT}
     w = transpose_swapbatch(w[end:-1:1, end:-1:1, end:-1:1, :, :])
@@ -133,7 +133,7 @@ Calculate the gradient imposed upon `w` in the convolution `y = x * w`.
 """
 ∇conv_filter_direct!
 
-@timeit_debug to function ∇conv_filter_direct!(dw::AbstractArray{wT,5}, x::AbstractArray{xT,5},
+function ∇conv_filter_direct!(dw::AbstractArray{wT,5}, x::AbstractArray{xT,5},
                               dy::AbstractArray{yT,5}, cdims::DenseConvDims;
                               alpha::wT=wT(1), beta=false) where {xT, yT, wT}
     x = transpose_swapbatch(x[end:-1:1, end:-1:1, end:-1:1, :, :])
diff --git a/src/impl/conv_im2col.jl b/src/impl/conv_im2col.jl
@@ -22,7 +22,7 @@ by setting `alpha` to a nonunitary value, various gain factors can be applied.
 Note for the particularly performance-minded, you can provide a pre-allocated `col`,
 which should eliminate any need for large allocations within this method.
 """
-@timeit_debug to function conv_im2col!(
+function conv_im2col!(
                 y::AbstractArray{T,5}, x::AbstractArray{T,5},
                 w::AbstractArray{T,5}, cdims::DenseConvDims;
                 col::AbstractArray{T,2}=similar(x, im2col_dims(cdims)),
@@ -49,12 +49,12 @@ which should eliminate any need for large allocations within this method.
     @inbounds for batch_idx in 1:size(x,5)
         # We invoke `@timeit_debug` on the outside of `im2col!()` because inference
         # doesn't like us putting it on the inside.
-        @timeit_debug to "im2col!" im2col!(col, view(x, :, :, :, :, batch_idx), cdims)
+        im2col!(col, view(x, :, :, :, :, batch_idx), cdims)
         GC.@preserve col, w, y, begin
             col_ptr = pointer(col)
             w_ptr = pointer(w)
             y_ptr = pointer(y, (batch_idx - 1)*M*N + 1)
-            @timeit_debug to "gemm!" gemm!(Val(false), Val(false), M, N, K, alpha, col_ptr, w_ptr, beta, y_ptr)
+            gemm!(Val(false), Val(false), M, N, K, alpha, col_ptr, w_ptr, beta, y_ptr)
         end
     end
     return y
@@ -66,7 +66,7 @@ end
 Conv backward pass onto the weights using im2col and GEMM; stores the result in `dw`.
 See the documentation for `conv_im2col!()` for explanation of optional parameters.
 """
-@timeit_debug to function ∇conv_filter_im2col!(
+function ∇conv_filter_im2col!(
                 dw::AbstractArray{T,5}, x::AbstractArray{T,5},
                 dy::AbstractArray{T,5}, cdims::DenseConvDims;
                 col::AbstractArray{T,2} = similar(dw, im2col_dims(cdims)),
@@ -95,14 +95,12 @@ See the documentation for `conv_im2col!()` for explanation of optional parameter
     K = prod(output_size(cdims))
     
     @inbounds for batch_idx in 1:size(x,5)
-        # We invoke `@timeit_debug` on the outside of `im2col!()` because inference
-        # doesn't like us putting it on the inside.
-        @timeit_debug to "im2col!" im2col!(col, view(x, :, :, :, :, batch_idx), cdims)
+        im2col!(col, view(x, :, :, :, :, batch_idx), cdims)
         GC.@preserve col, dw, dy, begin
             col_ptr = pointer(col)
             dy_ptr = pointer(dy,(batch_idx - 1)*K*N + 1)
             dw_ptr = pointer(dw)
-            @timeit_debug to "gemm!" gemm!(Val(true), Val(false), M, N, K, alpha, col_ptr, dy_ptr, beta, dw_ptr)
+            gemm!(Val(true), Val(false), M, N, K, alpha, col_ptr, dy_ptr, beta, dw_ptr)
         end
 
         # Because we accumulate over batches in this loop, we must set `beta` equal
@@ -118,7 +116,7 @@ end
 Conv2d backward pass onto the input using im2col and GEMM; stores the result in `dx`.
 See the documentation for `conv_im2col!()` for explanation of other parameters.
 """
-@timeit_debug to function ∇conv_data_im2col!(
+function ∇conv_data_im2col!(
                 dx::AbstractArray{T,5}, dy::AbstractArray{T,5},
                 w::AbstractArray{T,5}, cdims::DenseConvDims;
                 col::AbstractArray{T,2} = similar(dx, im2col_dims(cdims)),
@@ -149,9 +147,9 @@ See the documentation for `conv_im2col!()` for explanation of other parameters.
             dy_ptr = pointer(dy, (batch_idx - 1)*M*K + 1)
             w_ptr = pointer(w)
             col_ptr = pointer(col)
-            @timeit_debug to "gemm!" gemm!(Val(false), Val(true), M, N, K, alpha, dy_ptr, w_ptr, T(0), col_ptr)
+            gemm!(Val(false), Val(true), M, N, K, alpha, dy_ptr, w_ptr, T(0), col_ptr)
         end
-        @timeit_debug to "col2im!" col2im!(view(dx, :, :, :, :, batch_idx), col, cdims)
+        col2im!(view(dx, :, :, :, :, batch_idx), col, cdims)
     end
     return dx
 end
@@ -207,77 +205,74 @@ function im2col!(col::AbstractArray{T,2}, x::AbstractArray{T,4},
     # We begin by copying the central region of the image which requires no padding at all.
     # Eliminating the branches of the fully generalized version below gives us a nice
     # speedup on the majority of the data.
-    @timeit_debug to "im2col!() - central region" begin
-        @inbounds for c in 1:C_in
-            # Unpack "central region"
-            w_region, h_region, d_region = central_region
-
-            for kd in 1:kernel_d,
-                kh in 1:kernel_h,
-                kw in 1:kernel_w,
-                d in d_region,
-                h in h_region,
-                w in w_region
-
-                input_kd = project(d, stride_d, pad_d_lo) + (kd - 1)*dil_d
-                input_kh = project(h, stride_h, pad_h_lo) + (kh - 1)*dil_h
-                input_kw = project(w, stride_w, pad_w_lo) + (kw - 1)*dil_w
-                kidxs = kernel_index(kw, kh, kd, cdims)
+    @inbounds for c in 1:C_in
+        # Unpack "central region"
+        w_region, h_region, d_region = central_region
 
-                xval::T = x[input_kw, input_kh, input_kd, c]
-                col_reshaped[w, h, d, kidxs..., c] = xval
-            end
+        for kd in 1:kernel_d,
+            kh in 1:kernel_h,
+            kw in 1:kernel_w,
+            d in d_region,
+            h in h_region,
+            w in w_region
+ 
+            input_kd = project(d, stride_d, pad_d_lo) + (kd - 1)*dil_d
+            input_kh = project(h, stride_h, pad_h_lo) + (kh - 1)*dil_h
+            input_kw = project(w, stride_w, pad_w_lo) + (kw - 1)*dil_w
+            kidxs = kernel_index(kw, kh, kd, cdims)
+
+            xval::T = x[input_kw, input_kh, input_kd, c]
+            col_reshaped[w, h, d, kidxs..., c] = xval
         end
     end
     
+    
     # For each "padded region", we run the fully general version
-    @timeit_debug to "im2col!() - padded region" begin
-        @inbounds for (w_region, h_region, d_region) in padded_regions
-            for c in 1:C_in,
-                d in d_region,
-                h in h_region,
-                w in w_region,
-                kd in 1:kernel_d,
-                kh in 1:kernel_h,
-                kw in 1:kernel_w
+    @inbounds for (w_region, h_region, d_region) in padded_regions
+        for c in 1:C_in,
+            d in d_region,
+            h in h_region,
+            w in w_region,
+            kd in 1:kernel_d,
+            kh in 1:kernel_h,
+            kw in 1:kernel_w
 
-                input_kd = project(d, stride_d, pad_d_lo) + (kd - 1)*dil_d
-                input_kh = project(h, stride_h, pad_h_lo) + (kh - 1)*dil_h
-                input_kw = project(w, stride_w, pad_w_lo) + (kw - 1)*dil_w
+            input_kd = project(d, stride_d, pad_d_lo) + (kd - 1)*dil_d
+            input_kh = project(h, stride_h, pad_h_lo) + (kh - 1)*dil_h
+            input_kw = project(w, stride_w, pad_w_lo) + (kw - 1)*dil_w
 
-                kidxs = kernel_index(kw, kh, kd, cdims)
+            kidxs = kernel_index(kw, kh, kd, cdims)
 
-                # If this d is off the edge, then deal with the entire plane
-                # in one fell swoop, like a ravenous flock of crows.  CAW CAW.
-                if input_kd <= 0 || input_kd > depth
-                    for kh in 1:kernel_h,
-                        kw in 1:kernel_w
-                        col_reshaped[w, h, d, kidxs..., c] = T(0)
-                    end
-                    continue
-                end
-
-                # Same for `h`, but in this case it's only a line, not a plane.
-                # This results in slightly less caw'ing.
-                if input_kh <= 0 || input_kh > height
-                    for kw in 1:kernel_w
-                        col_reshaped[w, h, d, kidxs..., c] = T(0)
-                    end
-                    continue
+            # If this d is off the edge, then deal with the entire plane
+            # in one fell swoop, like a ravenous flock of crows.  CAW CAW.
+            if input_kd <= 0 || input_kd > depth
+                for kh in 1:kernel_h,
+                    kw in 1:kernel_w
+                    col_reshaped[w, h, d, kidxs..., c] = T(0)
                 end
+                continue
+            end
 
-                # If this `w` is off the edge it and only it gets cleared out
-                if input_kw <= 0 || input_kw > width
+            # Same for `h`, but in this case it's only a line, not a plane.
+            # This results in slightly less caw'ing.
+            if input_kh <= 0 || input_kh > height
+                for kw in 1:kernel_w
                     col_reshaped[w, h, d, kidxs..., c] = T(0)
-                    continue
                 end
+                continue
+            end
 
-                # Copy the data over
-                xval::T = x[input_kw, input_kh, input_kd, c]
-                col_reshaped[w, h, d, kidxs..., c] = xval
+            # If this `w` is off the edge it and only it gets cleared out
+            if input_kw <= 0 || input_kw > width
+                col_reshaped[w, h, d, kidxs..., c] = T(0)
+                continue
             end
+
+            # Copy the data over
+            xval::T = x[input_kw, input_kh, input_kd, c]
+            col_reshaped[w, h, d, kidxs..., c] = xval
         end
-    end
+    end    
 end
 
 
diff --git a/src/impl/depthwiseconv_direct.jl b/src/impl/depthwiseconv_direct.jl
@@ -18,7 +18,7 @@ channels in `x` is the last, not the second-to-last, as in a normal dense convol
 
 See the docstring for `conv_direct!()` for more on the optional parameters.
 """
-@timeit_debug to function depthwiseconv_direct!(
+function depthwiseconv_direct!(
                 y::AbstractArray{yT,5}, x::AbstractArray{xT,5},
                 w::AbstractArray{wT,5}, cdims::DepthwiseConvDims;
                 alpha::yT = yT(1), beta::yT = yT(0)) where {yT, xT, wT}
@@ -95,7 +95,7 @@ for each batch and channel independently.
 """
 ∇depthwiseconv_data_direct!
 
-@timeit_debug to function ∇depthwiseconv_data_direct!(
+function ∇depthwiseconv_data_direct!(
                 dx::AbstractArray{xT,5}, dy::AbstractArray{yT,5},
                 w::AbstractArray{wT,5}, cdims::DepthwiseConvDims;
                 alpha::xT=xT(1), beta::xT=xT(0)) where {xT, yT, wT}
@@ -128,7 +128,7 @@ Calculate the gradient imposed upon `w` in the depthwise convolution `y = x * w`
 """
 ∇depthwiseconv_filter_direct!
 
-@timeit_debug to function ∇depthwiseconv_filter_direct!(
+function ∇depthwiseconv_filter_direct!(
                 dw::AbstractArray{wT,5}, x::AbstractArray{xT,5},
                 dy::AbstractArray{yT,5}, cdims::DepthwiseConvDims;
                 alpha::wT=wT(1),beta::wT=wT(0)) where {xT, yT, wT}
diff --git a/src/impl/depthwiseconv_im2col.jl b/src/impl/depthwiseconv_im2col.jl
@@ -10,7 +10,7 @@ See `conv_im2col!()` for an explanation of optional parameters.
 """
 depthwiseconv_im2col!
 
-@timeit_debug to function depthwiseconv_im2col!(
+function depthwiseconv_im2col!(
                 y::AbstractArray{T,5}, x::AbstractArray{T,5},
                 w::AbstractArray{T,5}, cdims::DepthwiseConvDims;
                 col::AbstractArray{T,2} = similar(x, im2col_dims(cdims)),
@@ -28,9 +28,7 @@ depthwiseconv_im2col!
 
     dcdims = DenseConvDims(cdims)
     @inbounds for batch_idx in 1:size(x)[end]
-        # We invoke `@timeit_debug` on the outside of `im2col!()` because inference
-        # doesn't like us putting it on the inside.
-        @timeit_debug to "im2col!" im2col!(col, view(x, :, :, :, :, batch_idx), dcdims)
+        im2col!(col, view(x, :, :, :, :, batch_idx), dcdims)
 
         # We do a separate convolution for each channel in x, as we must
         for c_in in 1:channels_in(cdims)
@@ -54,7 +52,7 @@ See the documentation for `conv_im2col!()` for explanation of optional parameter
 """
 ∇depthwiseconv_filter_im2col!
 
-@timeit_debug to function ∇depthwiseconv_filter_im2col!(
+function ∇depthwiseconv_filter_im2col!(
                 dw::AbstractArray{T,5}, x::AbstractArray{T,5},
                 dy::AbstractArray{T,5}, cdims::DepthwiseConvDims;
                 col::AbstractArray{T,2} = similar(dw, im2col_dims(cdims)),
@@ -66,9 +64,7 @@ See the documentation for `conv_im2col!()` for explanation of optional parameter
     K = prod(output_size(cdims))
 
     @inbounds for batch_idx in 1:size(x)[end]
-        # We invoke `@timeit_debug` on the outside of `im2col!()` because inference
-        # doesn't like us putting it on the inside.
-        @timeit_debug to "im2col!" im2col!(col, view(x, :, :, :, :, batch_idx), cdims)
+        im2col!(col, view(x, :, :, :, :, batch_idx), cdims)
 
         # We do a separate convolution for each channel in x, as we must
         for c_in in 1:channels_in(cdims)
@@ -96,7 +92,7 @@ See the documentation for `conv_im2col!()` for explanation of optional parameter
 """
 ∇depthwiseconv_data_im2col!
 
-@timeit_debug to function ∇depthwiseconv_data_im2col!(
+function ∇depthwiseconv_data_im2col!(
                 dx::AbstractArray{T,5}, dy::AbstractArray{T,5},
                 w::AbstractArray{T,5}, cdims::DepthwiseConvDims;
                 col::AbstractArray{T,2} = similar(dx, im2col_dims(cdims)),
@@ -118,7 +114,7 @@ See the documentation for `conv_im2col!()` for explanation of optional parameter
                 gemm!(Val(false), Val(true), M, N, K, alpha, dy_ptr, w_ptr, T(0), col_ptr)
             end
         end
-        @timeit_debug to "col2im!" col2im!(view(dx, :, :, :, :, batch_idx), col, cdims)
+        col2im!(view(dx, :, :, :, :, batch_idx), col, cdims)
     end
     return dx
 end
diff --git a/src/nnpack/impl.jl b/src/nnpack/impl.jl
@@ -5,7 +5,7 @@ function maxpool_nnpack!(y::A, x::A, pdims::PoolDims) where {A<:Array{Float32, 4
                            stride = stride(pdims), threadpool = threadpool)
 end
 
-@timeit_debug to function conv_nnpack!(y::A1, x::A1, w::A1, cdims::ConvDims;
+function conv_nnpack!(y::A1, x::A1, w::A1, cdims::ConvDims;
                                        b::A2 = zeros(Float32, size(x, 3)),
                                        algo = UInt32(0)) where {A1<:Array{Float32, 4},
                                                                 A2<:Array{Float32, 1}}
@@ -20,7 +20,7 @@ end
                            stride = stride(cdims), threadpool = threadpool)
 end
 
-@timeit_debug to function ∇conv_data_nnpack!(dx::A, dy::A, w::A, cdims::ConvDims;
+function ∇conv_data_nnpack!(dx::A, dy::A, w::A, cdims::ConvDims;
                                              algo = UInt32(0)) where{A<:Array{Float32, 4}}
     check_dims(size(dx), size(w), size(dy), cdims)
     threadpool = select_threadpool(cdims, size(y, 4))
@@ -33,7 +33,7 @@ end
                                    stride = stride(cdims), threadpool = threadpool)
 end
 
-@timeit_debug to function ∇conv_filter_nnpack!(dw::A, x::A, dy::A, cdims::ConvDims;
+function ∇conv_filter_nnpack!(dw::A, x::A, dy::A, cdims::ConvDims;
                                                algo = UInt32(0)) where{A<:Array{Float32, 4}}
     check_dims(size(x), size(dw), size(dy), cdims)
     threadpool = select_threadpool(cdims, size(y, 4))
diff --git a/src/nnpack/interface.jl b/src/nnpack/interface.jl
@@ -7,7 +7,7 @@ for (front_name, backend) in (
         :∇conv_filter  => :_nnpack,
     )
     @eval begin
-        @timeit_debug to function $(Symbol("$(front_name)$(backend)!"))(
+        function $(Symbol("$(front_name)$(backend)!"))(
                         out::Array{T1,4}, in1::Array{T2,4}, in2::Array{T3,4},
                         cdims::ConvDims; kwargs...) where {T1, T2, T3}
             @warn "Automatically converting input tensor to Float32. This will have performance implications" maxlog=1
diff --git a/test/inference.jl b/test/inference.jl
@@ -0,0 +1,14 @@
+using NNlib, Test
+using NNlib: conv_direct, conv_im2col
+
+@testset "Conv Inference" begin
+    # Every convolution backend must have a concretely inferrable return type.
+    x, w = rand(10, 10, 3, 2), rand(3, 3, 3, 1)
+    cdims = DenseConvDims(x, w)
+    impls = [conv, conv_direct, conv_im2col]
+    NNlib.is_nnpack_available() && push!(impls, NNlib.conv_nnpack)
+    # One nested testset per backend so a failure in one does not abort the rest.
+    @testset "$(nameof(f))" for f in impls
+        @test (@inferred f(x, w, cdims)) isa AbstractArray
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -3,3 +3,4 @@ using NNlib, Test
 include("activation.jl")
 include("conv.jl")
 include("pooling.jl")
+include("inference.jl")

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ for (front_name, backend) in (`
`7`	`7`	`:∇conv_filter => :_nnpack,`
`8`	`8`	`)`
`9`	`9`	`@eval begin`
`10`		`- @timeit_debug to function $(Symbol("$(front_name)$(backend)!"))(`
	`10`	`+ function $(Symbol("$(front_name)$(backend)!"))(`
`11`	`11`	`out::Array{T1,4}, in1::Array{T2,4}, in2::Array{T3,4},`
`12`	`12`	`cdims::ConvDims; kwargs...) where {T1, T2, T3}`
`13`	`13`	`@warn "Automatically converting input tensor to Float32. This will have performance implications" maxlog=1`