Commit 2591d21

Author: Avik Pal
Message: Minor changes as per review
Parent: 43f759a

File tree: 5 files changed (+24 lines, -39 lines)

src/nnpack/NNPACK.jl

Lines changed: 3 additions & 3 deletions

@@ -25,14 +25,14 @@ end
     check_deps()
     status = nnp_initialize()
     if status == nnp_status_unsupported_hardware
-        @warn "HARDWARE is unsupported by NNPACK so falling back to default NNlib"
+        @warn "Hardware is unsupported by NNPACK so falling back to default NNlib"
     else
         include(nnlib_interface_path)
     end
     try
         global NNPACK_CPU_THREADS = parse(UInt64, ENV["NNPACK_CPU_THREADS"])
     catch
-        global NNPACK_CPU_THREADS = 4
+        global NNPACK_CPU_THREADS = Sys.CPU_THREADS
     end
-    shared_threadpool = pthreadpool_create(NNPACK_CPU_THREADS)
+    shared_threadpool[] = pthreadpool_create(NNPACK_CPU_THREADS)
 end
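Two behavioural changes land in this hunk: the fallback thread count now defaults to the host's logical core count (`Sys.CPU_THREADS`) instead of a hard-coded 4, and the threadpool handle is written through `shared_threadpool[]`, i.e. into a pre-declared `Ref`, instead of rebinding a global name at init time. A minimal sketch of that pattern, assuming `shared_threadpool` is declared as a `Ref{Ptr{Cvoid}}` elsewhere in the module (its declaration is outside this diff):

```julia
# Sketch only: the Ref declaration is assumed, not shown in this commit.
const shared_threadpool = Ref{Ptr{Cvoid}}(C_NULL)

function __init__()
    threads = try
        parse(UInt64, ENV["NNPACK_CPU_THREADS"])
    catch
        UInt64(Sys.CPU_THREADS)  # fall back to the host's logical core count
    end
    # Writing through the Ref mutates its contents in place; code that closed
    # over `shared_threadpool` at compile time then sees the new pointer,
    # which is what the `shared_threadpool[] = ...` change achieves.
    shared_threadpool[] = pthreadpool_create(threads)
end
```

This matches the `threadpool = shared_threadpool[]` default arguments used throughout `libnnpack.jl` below.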

src/nnpack/error.jl

Lines changed: 1 addition & 1 deletion

@@ -71,7 +71,7 @@ function NNPACKError(status::nnp_status)
     NNPACKError(status, msg)
 end

-macro check(nnp_func)
+macro nnpack_check(nnp_func)
    quote
        local err::nnp_status
        err = $(esc(nnp_func))
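Renaming `@check` to `@nnpack_check` gives the status guard a package-specific name, avoiding clashes with `@check` macros exported by other packages loaded in the same session. The diff truncates the macro body after the `err = ...` line; a plausible completion, assuming the macro throws on any non-success status (a hypothetical reconstruction, not the committed code):

```julia
macro nnpack_check(nnp_func)
    quote
        local err::nnp_status
        err = $(esc(nnp_func))
        # Assumed behaviour: surface a failing NNPACK status as a Julia error,
        # reusing the NNPACKError constructor defined earlier in this file.
        if err != nnp_status_success
            throw(NNPACKError(err))
        end
        err
    end
end
```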

src/nnpack/impl.jl

Lines changed: 7 additions & 24 deletions

@@ -2,12 +2,7 @@

 function maxpool_nnpack!(y::A, x::A, pdims::PoolDims) where {A<:Array{Float32, 4}}
     check_dims(size(x), size(y), pdims)
-
-    pad = padding(pdims)
-    stride_ = stride(pdims)
-    kernel = kernel_size(pdims)
-
-    nnp_max_pooling_output(y, x, kernel, padding = pad, stride = stride_)
+    nnp_max_pooling_output(y, x, kernel_size(pdims), padding = padding(pdims), stride = stride(pdims))
 end

 @timeit_debug to function conv_nnpack!(y::A1, x::A1, w::A1, cdims::ConvDims;
@@ -16,43 +11,31 @@ end
                                        A2<:Array{Float32, 1}}
     check_dims(size(x), size(w), size(y), cdims)

-    flipkernel_ = flipkernel(cdims)
-    if flipkernel_ == 0
+    if flipkernel(cdims) == 0
         w .= flipweight(w)
     end

-    pad = padding(cdims)
-    stride_ = stride(cdims)
-
-    nnp_convolution_output(y, x, w, b, algo = algo, padding = pad, stride = stride_)
+    nnp_convolution_output(y, x, w, b, algo = algo, padding = padding(cdims), stride = stride(cdims))
 end

 @timeit_debug to function ∇conv_data_nnpack!(dx::A, dy::A, w::A, cdims::ConvDims;
                                              algo = UInt32(0)) where{A<:Array{Float32, 4}}
     check_dims(size(dx), size(w), size(dy), cdims)

-    flipkernel_ = flipkernel(cdims)
-    if flipkernel_ == 0
+    if flipkernel(cdims) == 0
         w .= flipweight(w)
     end

-    pad = padding(cdims)
-    stride_ = stride(cdims)
-
-    nnp_convolution_input_gradient(dx, dy, w, algo = algo, padding = pad, stride = stride_)
+    nnp_convolution_input_gradient(dx, dy, w, algo = algo, padding = padding(cdims), stride = stride(cdims))
 end

 @timeit_debug to function ∇conv_filter_nnpack!(dw::A, x::A, dy::A, cdims::ConvDims;
                                                algo = UInt32(0)) where{A<:Array{Float32, 4}}
     check_dims(size(x), size(dw), size(dy), cdims)

-    flipkernel_ = flipkernel(cdims)
-    pad = padding(cdims)
-    stride_ = stride(cdims)
-
-    nnp_convolution_kernel_gradient(dw, x, dy, algo = algo, padding = pad, stride = stride_)
+    nnp_convolution_kernel_gradient(dw, x, dy, algo = algo, padding = padding(cdims), stride = stride(cdims))

-    if flipkernel_ == 0
+    if flipkernel(cdims) == 0
         dw .= flipweight(dw)
     end
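The refactor inlines the `padding`/`stride`/`kernel_size` accessors at their call sites and drops the single-use locals; behaviour is unchanged. The `flipweight` logic stays: NNPACK's convolution uses the opposite kernel orientation to the one NNlib requests when `flipkernel(cdims) == 0`, so the weights are spatially reversed before the call, and the filter gradient is reversed back afterwards. A sketch of what the flip amounts to, assuming `flipweight` (defined elsewhere in NNlib) reverses the two spatial axes of a 4-D kernel:

```julia
# Hypothetical equivalent of flipweight for a width × height × Cin × Cout kernel:
# reverse both spatial axes, leave the channel axes untouched.
flipweight(w::AbstractArray{T,4}) where {T} = w[end:-1:1, end:-1:1, :, :]
```

Note that `w .= flipweight(w)` mutates the caller's weight array in place, which is worth keeping in mind when reading these kernels.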

src/nnpack/interface.jl

Lines changed: 2 additions & 0 deletions

@@ -10,6 +10,7 @@ for (front_name, backend) in (
     @timeit_debug to function $(Symbol("$(front_name)$(backend)!"))(
                     out::Array{T1,4}, in1::Array{T2,4}, in2::Array{T3,4},
                     cdims::ConvDims; kwargs...) where {T1, T2, T3}
+        @warn "Automatically converting $(size(in1)) input tensor to Float32" maxlog=1
         # Output must of the same type as in the function signature
         T1.($(Symbol("$(front_name)$(backend)!"))(Float32.(out), Float32.(in1),
                                                   Float32.(in2), cdims; kwargs...))
@@ -20,6 +21,7 @@ end

 function maxpool_nnpack!(y::Array{T1, 4}, x::Array{T2, 4}, pdims::PoolDims;
                          kwargs...) where {T1, T2}
+    @warn "Automatically converting $(size(x)) input tensor to Float32" maxlog=1
     # We want the output to be of the same type as desired
     T1.(maxpool_nnpack!(Float32.(y), Float32.(x), pdims; kwargs...))
 end
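Both new warnings pass `maxlog=1`, so the note about the implicit `Float32` conversion is logged once per session rather than on every call into the fallback path. A standalone illustration of the keyword, independent of NNlib:

```julia
for i in 1:3
    # Prints on the first iteration only: Julia's default logger honours
    # the maxlog hint and suppresses subsequent occurrences.
    @warn "Automatically converting input tensor to Float32" maxlog=1
end
```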

src/nnpack/libnnpack.jl

Lines changed: 11 additions & 11 deletions

@@ -4,15 +4,15 @@ function nnp_initialize()
 end

 function nnp_deinitialize()
-    @check ccall((:nnp_deinitialize, libnnpack), nnp_status, (),)
+    @nnpack_check ccall((:nnp_deinitialize, libnnpack), nnp_status, (),)
 end

 function pthreadpool_create(n = 0)
     ccall((:pthreadpool_create, libnnpack), Ptr{Cvoid}, (Csize_t,), n)
 end

 function nnp_relu_output(batch_size, channels, input, output, negative_slope, threadpool)
-    @check ccall((:nnp_relu_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, input, output, negative_slope, threadpool)
+    @nnpack_check ccall((:nnp_relu_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, input, output, negative_slope, threadpool)
 end

 function nnp_relu_output(x::Array{Float32,N}, y::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = shared_threadpool[]) where {N}
@@ -22,7 +22,7 @@ function nnp_relu_output(x::Array{Float32,N}, y::Array{Float32,N}; negative_slop
 end

 function nnp_relu_input_gradient(batch_size, channels, grad_output, input, grad_input, negative_slope, threadpool)
-    @check ccall((:nnp_relu_input_gradient, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, grad_output, input, grad_input, negative_slope, threadpool)
+    @nnpack_check ccall((:nnp_relu_input_gradient, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, grad_output, input, grad_input, negative_slope, threadpool)
 end

 function nnp_relu_input_gradient(x::Array{Float32,N}, dy::Array{Float32,N}, dx::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = shared_threadpool[]) where {N}
@@ -32,7 +32,7 @@ function nnp_relu_input_gradient(x::Array{Float32,N}, dy::Array{Float32,N}, dx::
 end

 function nnp_softmax_output(batch_size, channels, input, output, threadpool)
-    @check ccall((:nnp_softmax_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input, output, threadpool)
+    @nnpack_check ccall((:nnp_softmax_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input, output, threadpool)
 end

 function nnp_softmax_output(x::VecOrMat{Float32}, y::VecOrMat{Float32}; threadpool = shared_threadpool[])
@@ -44,7 +44,7 @@ end
 #NOTE: This most likely due to nnpack being row major. Investigate this.

 function nnp_fully_connected_output(batch_size, input_channels, output_channels, input, kernel, output, threadpool, profile)
-    @check ccall((:nnp_fully_connected_output, libnnpack), nnp_status, (Csize_t, Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t, Ptr{Cvoid}), batch_size, input_channels, output_channels, input, kernel, output, threadpool, C_NULL)
+    @nnpack_check ccall((:nnp_fully_connected_output, libnnpack), nnp_status, (Csize_t, Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t, Ptr{Cvoid}), batch_size, input_channels, output_channels, input, kernel, output, threadpool, C_NULL)
 end

 function nnp_fully_connected_output(x::Array{Float32,2}, w::Array{Float32,2}, y::Array{Float32,2}; profile = nothing, threadpool = shared_threadpool[])
@@ -54,7 +54,7 @@ function nnp_fully_connected_output(x::Array{Float32,2}, w::Array{Float32,2}, y:
 end

 function nnp_fully_connected_inference_f16f32(input_channels, output_channels, input, kernel, output, threadpool)
-    @check ccall((:nnp_fully_connected_inference_f16f32, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cvoid}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
+    @nnpack_check ccall((:nnp_fully_connected_inference_f16f32, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cvoid}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
 end

 nnp_fully_connected_inference_f16f32(x::Array{Float32, 1}, w::Array{Float16,2}, y::Array{Float32, 1}; threadpool = shared_threadpool[]) =
@@ -66,7 +66,7 @@ function nnp_fully_connected_inference_f16f32(x::Array{Float32, 2}, w::Array{Flo
 end

 function nnp_fully_connected_inference(input_channels, output_channels, input, kernel, output, threadpool)
-    @check ccall((:nnp_fully_connected_inference, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
+    @nnpack_check ccall((:nnp_fully_connected_inference, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
 end

 nnp_fully_connected_inference(x::Array{Float32, 1}, w::Array{Float32,2}; threadpool = shared_threadpool[]) =
@@ -78,7 +78,7 @@ function nnp_fully_connected_inference(x::Array{Float32, 2}, w::Array{Float32, 2
 end

 function nnp_max_pooling_output(batch_size, channels, input_size, input_padding, pooling_size, pooling_stride, input, output, threadpool)
-    @check ccall((:nnp_max_pooling_output, libnnpack), nnp_status, (Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input_size, input_padding, pooling_size, pooling_stride, input, output, threadpool)
+    @nnpack_check ccall((:nnp_max_pooling_output, libnnpack), nnp_status, (Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input_size, input_padding, pooling_size, pooling_stride, input, output, threadpool)
 end

 function nnp_max_pooling_output(y::Array{Float32,4}, x::Array{Float32,4}, kernel::Tuple; padding = 0, stride = 1, threadpool = shared_threadpool[])
@@ -93,7 +93,7 @@ end
 #TODO: Add wrapper for convolution inference

 function nnp_convolution_input_gradient(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile)
-    @check ccall((:nnp_convolution_input_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
+    @nnpack_check ccall((:nnp_convolution_input_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
 end

 function nnp_convolution_input_gradient(dx::Array{Float32,4}, dy::Array{Float32,4}, w::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = shared_threadpool[], profile = nothing)
@@ -107,7 +107,7 @@ function nnp_convolution_input_gradient(dx::Array{Float32,4}, dy::Array{Float32,
 end

 function nnp_convolution_kernel_gradient(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, grad_output, grad_kernel, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile)
-    @check ccall((:nnp_convolution_kernel_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, grad_output, grad_kernel, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
+    @nnpack_check ccall((:nnp_convolution_kernel_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, grad_output, grad_kernel, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
 end

 function nnp_convolution_kernel_gradient(dw::Array{Float32,4}, x::Array{Float32,4}, dy::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = shared_threadpool[], profile = nothing)
@@ -121,7 +121,7 @@ function nnp_convolution_kernel_gradient(dw::Array{Float32,4}, x::Array{Float32,
 end

 function nnp_convolution_output(algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, profile)
-    @check ccall((:nnp_convolution_output, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
+    @nnpack_check ccall((:nnp_convolution_output, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
 end

 function nnp_convolution_output(y::Array{Float32,4}, x::Array{Float32,4}, w::Array{Float32,4}, b::Array{Float32,1}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = shared_threadpool[], profile = nothing)
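Mechanically, every raw `ccall` that returns an `nnp_status` is now guarded by `@nnpack_check` instead of the old `@check`; the wrapped calls themselves are unchanged. For reference, the guard can also be written without a macro; a minimal sketch assuming non-success statuses should raise (the helper name here is hypothetical):

```julia
# Macro-free equivalent of `@nnpack_check ccall((:nnp_deinitialize, libnnpack), ...)`.
function checked_nnp_deinitialize()
    status = ccall((:nnp_deinitialize, libnnpack), nnp_status, ())
    status == nnp_status_success || throw(NNPACKError(status))
    return status
end
```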
