Lay down the structure for runtime performance check

Avik Pal · Avik Pal · commit 8ced0c0e5f00 · 2019-04-11T14:35:58.000+05:30
diff --git a/src/dim_helpers.jl b/src/dim_helpers.jl
@@ -119,4 +119,18 @@ function predilate(x::AbstractArray{T,N}, dilation::NTuple{M}) where {T, N, M}
     # zeros between each element of `x` along each spatial dimension.
     x_dil[(1:dilation[idx]:size(x_dil,idx) for idx in 1:(N-2))..., :, :] .= x
     return x_dil
-end
+end
+
+"""
+    flipweight(w::AbstractArray)
+
+Reorders the weight tensor for supporting both convolution and cross-correlation operations.
+
+4-dimensional arrays are reversed along their first two dimensions and 5-dimensional arrays
+along their first three; arrays of any other dimensionality are returned unchanged.
+"""
+@inline flipweight(w::AbstractArray) = w
+
+@inline flipweight(w::AbstractArray{T, 4}) where {T} = w[end:-1:1, end:-1:1, :, :]
+
+@inline flipweight(w::AbstractArray{T, 5}) where {T} = w[end:-1:1, end:-1:1, end:-1:1, :, :]
diff --git a/src/nnpack/NNPACK.jl b/src/nnpack/NNPACK.jl
@@ -1,16 +1,22 @@
 include("libnnpack_types.jl")
 include("error.jl")
 include("libnnpack.jl")
+include("performance.jl")
+include("interface.jl")
 
 const depsjl_path = joinpath(dirname(@__FILE__), "..", "..", "deps", "deps.jl")
 if !isfile(depsjl_path)
     error("NNPACK not installed properly, run Pkg.build(\"NNlib\"), restart Julia and try again")
 end
 include(depsjl_path)
 
-const nnlib_interface_path = joinpath(dirname(@__FILE__), "interface.jl")
-const shared_threadpool = Ref(C_NULL)
+const shared_threadpool_dict = Dict{UInt64, Base.RefValue}()  # thread count => Ref(threadpool)
 
+"""
+    is_nnpack_available()
+
+Checks if the current hardware is supported by NNPACK.
+"""
 function is_nnpack_available()
     check_deps()
     status = nnp_initialize()
@@ -21,18 +27,30 @@ function is_nnpack_available()
     end
 end
 
+"""
+    allocate_threadpool()
+
+Creates one threadpool per power of two between 2 and `NNPACK_CPU_THREADS` and stores each
+in `shared_threadpool_dict`, keyed by its thread count, so a suitable pool can be chosen later.
+"""
+function allocate_threadpool()
+    max_exponent = floor(Int, log2(NNPACK_CPU_THREADS))
+    for exponent in 1:max_exponent
+        nthreads = UInt64(2^exponent)
+        shared_threadpool_dict[nthreads] = Ref(pthreadpool_create(nthreads))
+    end
+end
+
 @init begin
     check_deps()
     status = nnp_initialize()
     if status == nnp_status_unsupported_hardware
         @warn "Hardware is unsupported by NNPACK so falling back to default NNlib"
-    else
-        include(nnlib_interface_path)
     end
     try
         global NNPACK_CPU_THREADS = parse(UInt64, ENV["NNPACK_CPU_THREADS"])
     catch
         global NNPACK_CPU_THREADS = Sys.CPU_THREADS
     end
-    shared_threadpool[] = pthreadpool_create(NNPACK_CPU_THREADS)
+    allocate_threadpool()  # fills shared_threadpool_dict; needs NNPACK_CPU_THREADS set above
 end
diff --git a/src/nnpack/impl.jl b/src/nnpack/impl.jl
@@ -1,39 +1,45 @@
-@inline flipweight(w::Array{<:Any, 4}) = w[end:-1:1,end:-1:1,:,:]
-
 function maxpool_nnpack!(y::A, x::A, pdims::PoolDims) where {A<:Array{Float32, 4}}
     check_dims(size(x), size(y), pdims)
-    nnp_max_pooling_output(y, x, kernel_size(pdims), padding = padding(pdims), stride = stride(pdims))
+    threadpool = select_threadpool(pdims, size(y, 4))  # size(y, 4) is passed as batch_size
+    nnp_max_pooling_output(y, x, kernel_size(pdims), padding = padding(pdims),
+                           stride = stride(pdims), threadpool = threadpool)
 end
 
 @timeit_debug to function conv_nnpack!(y::A1, x::A1, w::A1, cdims::ConvDims;
                                        b::A2 = zeros(Float32, size(x, 3)),
                                        algo = UInt32(0)) where {A1<:Array{Float32, 4},
                                                                 A2<:Array{Float32, 1}}
     check_dims(size(x), size(w), size(y), cdims)
-    
+    threadpool = select_threadpool(cdims, size(y, 4))  # size(y, 4) is passed as batch_size
+    # NOTE(review): the `.=` below overwrites the caller's `w` in place — confirm intended.
     if flipkernel(cdims) == 0
         w .= flipweight(w)
     end
 
-    nnp_convolution_output(y, x, w, b, algo = algo, padding = padding(cdims), stride = stride(cdims))
+    nnp_convolution_output(y, x, w, b, algo = algo, padding = padding(cdims),
+                           stride = stride(cdims), threadpool = threadpool)
 end
 
 @timeit_debug to function ∇conv_data_nnpack!(dx::A, dy::A, w::A, cdims::ConvDims;
                                              algo = UInt32(0)) where{A<:Array{Float32, 4}}
     check_dims(size(dx), size(w), size(dy), cdims)
+    threadpool = select_threadpool(cdims, size(dx, 4))  # dx's 4th dim is passed as batch_size
     
     if flipkernel(cdims) == 0
         w .= flipweight(w)
     end
 
-    nnp_convolution_input_gradient(dx, dy, w, algo = algo, padding = padding(cdims), stride = stride(cdims))
+    nnp_convolution_input_gradient(dx, dy, w, algo = algo, padding = padding(cdims),
+                                   stride = stride(cdims), threadpool = threadpool)
 end
 
 @timeit_debug to function ∇conv_filter_nnpack!(dw::A, x::A, dy::A, cdims::ConvDims;
                                                algo = UInt32(0)) where{A<:Array{Float32, 4}}
     check_dims(size(x), size(dw), size(dy), cdims)
+    threadpool = select_threadpool(cdims, size(dy, 4))  # dy's 4th dim is passed as batch_size
     
-    nnp_convolution_kernel_gradient(dw, x, dy, algo = algo, padding = padding(cdims), stride = stride(cdims))
+    nnp_convolution_kernel_gradient(dw, x, dy, algo = algo, padding = padding(cdims),
+                                    stride = stride(cdims), threadpool = threadpool)
 
     if flipkernel(cdims) == 0
         dw .= flipweight(dw)
diff --git a/src/nnpack/libnnpack.jl b/src/nnpack/libnnpack.jl
@@ -15,7 +15,7 @@ function nnp_relu_output(batch_size, channels, input, output, negative_slope, th
     @nnpack_check ccall((:nnp_relu_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, input, output, negative_slope, threadpool)
 end
 
-function nnp_relu_output(x::Array{Float32,N}, y::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = shared_threadpool[]) where {N}
+function nnp_relu_output(x::Array{Float32,N}, y::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = C_NULL) where {N}
     # Investigate why the channel and batch dims need to specified like this
     nnp_relu_output(prod(size(x)[N-1:N]), prod(size(x)[1:N-2]), x, y, negative_slope, threadpool)
     y
@@ -25,7 +25,7 @@ function nnp_relu_input_gradient(batch_size, channels, grad_output, input, grad_
     @nnpack_check ccall((:nnp_relu_input_gradient, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, grad_output, input, grad_input, negative_slope, threadpool)
 end
 
-function nnp_relu_input_gradient(x::Array{Float32,N}, dy::Array{Float32,N}, dx::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = shared_threadpool[]) where {N}
+function nnp_relu_input_gradient(x::Array{Float32,N}, dy::Array{Float32,N}, dx::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = C_NULL) where {N}
     # Investigate why the channel and batch dims need to specified like this
     nnp_relu_input_gradient(Csize_t(prod(size(x)[N-1:N])), prod(size(x)[1:N-2]), dy, x, dx, negative_slope, threadpool)
     dx
@@ -35,7 +35,7 @@ function nnp_softmax_output(batch_size, channels, input, output, threadpool)
     @nnpack_check ccall((:nnp_softmax_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input, output, threadpool)
 end
 
-function nnp_softmax_output(x::VecOrMat{Float32}, y::VecOrMat{Float32}; threadpool = shared_threadpool[])
+function nnp_softmax_output(x::VecOrMat{Float32}, y::VecOrMat{Float32}; threadpool = C_NULL)
     nnp_softmax_output(ndims(x) == 2 ? size(x, 2) : 1, size(x, 1), x, y, threadpool)
     y
 end
@@ -47,7 +47,7 @@ function nnp_fully_connected_output(batch_size, input_channels, output_channels,
     @nnpack_check ccall((:nnp_fully_connected_output, libnnpack), nnp_status, (Csize_t, Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t, Ptr{Cvoid}), batch_size, input_channels, output_channels, input, kernel, output, threadpool, C_NULL)
 end
 
-function nnp_fully_connected_output(x::Array{Float32,2}, w::Array{Float32,2}, y::Array{Float32,2}; profile = nothing, threadpool = shared_threadpool[])
+function nnp_fully_connected_output(x::Array{Float32,2}, w::Array{Float32,2}, y::Array{Float32,2}; profile = nothing, threadpool = C_NULL)
     profile = profile == nothing ? nnp_profile() : profile
     nnp_fully_connected_output(size(x, 2), size(x, 1), size(w, 1), x, w, y, threadpool, profile)
     y
@@ -57,10 +57,10 @@ function nnp_fully_connected_inference_f16f32(input_channels, output_channels, i
     @nnpack_check ccall((:nnp_fully_connected_inference_f16f32, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cvoid}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
 end
 
-nnp_fully_connected_inference_f16f32(x::Array{Float32, 1}, w::Array{Float16,2}, y::Array{Float32, 1}; threadpool = shared_threadpool[]) =
+nnp_fully_connected_inference_f16f32(x::Array{Float32, 1}, w::Array{Float16,2}, y::Array{Float32, 1}; threadpool = C_NULL) =
     nnp_fully_connected_inference(reshape(x, size(x), 1), w, reshape(y, size(y), 1), threadpool = threadpool)
 
-function nnp_fully_connected_inference_f16f32(x::Array{Float32, 2}, w::Array{Float16,2}, y::Array{Float32, 2}; threadpool = shared_threadpool[])
+function nnp_fully_connected_inference_f16f32(x::Array{Float32, 2}, w::Array{Float16,2}, y::Array{Float32, 2}; threadpool = C_NULL)
     nnp_fully_connected_inference(size(x, 1), size(y, 1), x, w, y, threadpool)
     y
 end
@@ -69,10 +69,10 @@ function nnp_fully_connected_inference(input_channels, output_channels, input, k
     @nnpack_check ccall((:nnp_fully_connected_inference, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
 end
 
-nnp_fully_connected_inference(x::Array{Float32, 1}, w::Array{Float32,2}; threadpool = shared_threadpool[]) =
+nnp_fully_connected_inference(x::Array{Float32, 1}, w::Array{Float32,2}; threadpool = C_NULL) =
     nnp_fully_connected_inference(reshape(x, size(x), 1), w, threadpool = threadpool)
 
-function nnp_fully_connected_inference(x::Array{Float32, 2}, w::Array{Float32, 2}, y::Array{Float32, 2}; threadpool = shared_threadpool[])
+function nnp_fully_connected_inference(x::Array{Float32, 2}, w::Array{Float32, 2}, y::Array{Float32, 2}; threadpool = C_NULL)
     nnp_fully_connected_inference(size(x, 1), size(y, 1), x, w, y, threadpool)
     y
 end
@@ -81,7 +81,7 @@ function nnp_max_pooling_output(batch_size, channels, input_size, input_padding,
     @nnpack_check ccall((:nnp_max_pooling_output, libnnpack), nnp_status, (Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input_size, input_padding, pooling_size, pooling_stride, input, output, threadpool)
 end
 
-function nnp_max_pooling_output(y::Array{Float32,4}, x::Array{Float32,4}, kernel::Tuple; padding = 0, stride = 1, threadpool = shared_threadpool[])
+function nnp_max_pooling_output(y::Array{Float32,4}, x::Array{Float32,4}, kernel::Tuple; padding = 0, stride = 1, threadpool = C_NULL)
     input_size = nnp_size(Csize_t.((size(x, 1), size(x, 2)))...)
     pooling_size = nnp_size(Csize_t.(kernel)...)
     input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
@@ -96,7 +96,7 @@ function nnp_convolution_input_gradient(algorithm, batch_size, input_channels, o
     @nnpack_check ccall((:nnp_convolution_input_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
 end
 
-function nnp_convolution_input_gradient(dx::Array{Float32,4}, dy::Array{Float32,4}, w::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = shared_threadpool[], profile = nothing)
+function nnp_convolution_input_gradient(dx::Array{Float32,4}, dy::Array{Float32,4}, w::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing)
     input_size = nnp_size(Csize_t.((size(dx,1), size(dx,2)))...)
     kernel_size = nnp_size(Csize_t.((size(w,1),size(w,2)))...)
     input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
@@ -110,7 +110,7 @@ function nnp_convolution_kernel_gradient(algorithm, batch_size, input_channels,
     @nnpack_check ccall((:nnp_convolution_kernel_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, grad_output, grad_kernel, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
 end
 
-function nnp_convolution_kernel_gradient(dw::Array{Float32,4}, x::Array{Float32,4}, dy::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = shared_threadpool[], profile = nothing)
+function nnp_convolution_kernel_gradient(dw::Array{Float32,4}, x::Array{Float32,4}, dy::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing)
     input_size = nnp_size(Csize_t.((size(x,1), size(x,2)))...)
     kernel_size = nnp_size(Csize_t.((size(dw,1),size(dw,2)))...)
     input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
@@ -124,7 +124,7 @@ function nnp_convolution_output(algorithm, batch_size, input_channels, output_ch
     @nnpack_check ccall((:nnp_convolution_output, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
 end
 
-function nnp_convolution_output(y::Array{Float32,4}, x::Array{Float32,4}, w::Array{Float32,4}, b::Array{Float32,1}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = shared_threadpool[], profile = nothing)
+function nnp_convolution_output(y::Array{Float32,4}, x::Array{Float32,4}, w::Array{Float32,4}, b::Array{Float32,1}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing)
     input_size = nnp_size(Csize_t.((size(x,1), size(x,2)))...)
     kernel_size = nnp_size(Csize_t.((size(w,1),size(w,2)))...)
     input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
diff --git a/src/nnpack/performance.jl b/src/nnpack/performance.jl
@@ -0,0 +1,7 @@
+function select_threadpool(cdims::DenseConvDims, batch_size::Int)
+    return C_NULL  # placeholder: both arguments are ignored for now
+end
+
+function select_threadpool(pdims::PoolDims, batch_size::Int)
+    return C_NULL  # placeholder: both arguments are ignored for now
+end