Skip to content

Commit 5b35148

Browse files
author
Avik Pal
committed
Some heuristics to choose threadpool for pooling
1 parent bc19012 commit 5b35148

File tree

4 files changed

+38
-3
lines changed

4 files changed

+38
-3
lines changed

src/NNlib.jl

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@ include("impl/depthwiseconv_im2col.jl")
2424
# Direct implementations of pooling
2525
include("impl/pooling_direct.jl")
2626

27-
to = TimerOutput()
28-
2927
if Sys.islinux()
3028
include("nnpack/NNPACK.jl")
3129
else

src/nnpack/NNPACK.jl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,10 @@ end
5050
try
5151
global NNPACK_CPU_THREADS = parse(UInt64, ENV["NNPACK_CPU_THREADS"])
5252
catch
53-
global NNPACK_CPU_THREADS = Sys.CPU_THREADS
53+
# Sys.CPU_THREADS should be a better default if we are tuning the benchmark suite on
54+
# a particular machine. However, we fix the runtime threadpool here to have a max of
55+
# 4 threads so anything above will be ignored anyway
56+
global NNPACK_CPU_THREADS = UInt64(4)
5457
end
5558
allocate_threadpool()
5659
end

src/nnpack/interface.jl

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,33 @@ for (front_name, backend) in (
1919
end
2020

2121

22+
function conv_nnpack(x::Array{T1, 4}, w::Array{T2, 4}, cdims::ConvDims; kwargs...) where {T1, T2}
    # Out-of-place convolution front-end: allocate the output in
    # (spatial..., channels_out, batch) layout and delegate to the in-place
    # NNPACK kernel `conv_nnpack!`.
    #
    # BUGFIX: `output_size(cdims)` returns a tuple of spatial dims, so it must
    # be splatted into `similar` — passing the tuple as a single dim argument
    # alongside integers is a dims error (compare `maxpool_nnpack`, which
    # splats `output_size(pdims)...`).
    y = similar(x, output_size(cdims)..., channels_out(cdims), size(x, 4))
    return conv_nnpack!(y, x, w, cdims; kwargs...)
end
26+
27+
28+
function ∇conv_data(dy::Array{T1, 4}, w::Array{T2, 4}, cdims::ConvDims; kwargs...) where {T1, T2}
    # Gradient of the convolution w.r.t. its input: allocate `dx` with the
    # input's (spatial..., channels_in, batch) shape and fill it in place via
    # `∇conv_data!`.
    #
    # BUGFIX: `input_size(cdims)` is a tuple of spatial dims and must be
    # splatted into `similar`; unsplatted it is a dims error (same convention
    # as `maxpool_nnpack`'s use of `output_size(pdims)...`).
    dx = similar(dy, input_size(cdims)..., channels_in(cdims), size(dy, 4))
    return ∇conv_data!(dx, dy, w, cdims; kwargs...)
end
32+
33+
34+
function ∇conv_filter(x::Array{T1, 4}, dy::Array{T2, 4}, cdims::ConvDims; kwargs...) where {T1, T2}
    # Gradient of the convolution w.r.t. the kernel: `dw` has shape
    # (kernel spatial..., channels_in, channels_out), filled in place via
    # `∇conv_filter!`.
    #
    # BUGFIX: `kernel_size(cdims)` is a tuple of spatial dims and must be
    # splatted into `similar`; unsplatted it is a dims error (same convention
    # as `maxpool_nnpack`'s use of `output_size(pdims)...`).
    dw = similar(x, kernel_size(cdims)..., channels_in(cdims), channels_out(cdims))
    return ∇conv_filter!(dw, x, dy, cdims; kwargs...)
end
38+
39+
2240
function maxpool_nnpack!(y::Array{T1, 4}, x::Array{T2, 4}, pdims::PoolDims;
                         kwargs...) where {T1, T2}
    # Generic fallback: NNPACK only operates on Float32, so promote both
    # buffers, run the Float32 method (dispatched to elsewhere in this file),
    # and convert the result back to the caller's requested eltype T1.
    @warn "Automatically converting $(size(x)) input tensor to Float32" maxlog=1
    y32 = Float32.(y)
    x32 = Float32.(x)
    pooled = maxpool_nnpack!(y32, x32, pdims; kwargs...)
    # We want the output to be of the same type as desired
    return T1.(pooled)
end
46+
47+
48+
function maxpool_nnpack(x::Array{T, 4}, pdims::PoolDims; kwargs...) where {T}
    # Out-of-place max-pooling front-end: build the output dims as
    # (spatial..., channels_out, batch), allocate, and delegate to the
    # in-place kernel `maxpool_nnpack!`.
    out_dims = (output_size(pdims)..., channels_out(pdims), size(x, 4))
    y = similar(x, out_dims)
    return maxpool_nnpack!(y, x, pdims; kwargs...)
end

src/nnpack/performance.jl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,15 @@ function select_threadpool(cdims::DenseConvDims, batch_size::Int)
33
end
44

55
function select_threadpool(pdims::PoolDims, batch_size::Int)
    # Heuristic threadpool selection for pooling: only workloads large enough
    # to amortize threading overhead get NNPACK's shared 4-thread pool;
    # everything else returns C_NULL, which tells NNPACK to run serially.
    # NOTE(review): only the first spatial dimension is consulted — presumably
    # inputs are (near-)square; confirm before relying on this for very
    # anisotropic inputs.
    inp_size = input_size(pdims)[1]
    # The original four branches all returned the same pool; collapse the
    # duplicated returns into a single guarded condition.
    if batch_size >= 32 ||
       (batch_size >= 16 && inp_size >= 64) ||
       inp_size >= 128 ||
       inp_size * batch_size >= 256
        return shared_threadpool_dict[4][]
    end
    return C_NULL
end

0 commit comments

Comments
 (0)