Skip to content

Commit 8ced0c0

Browse files
author
Avik Pal
committed
Lay down the structure for runtime performance check
1 parent 2591d21 commit 8ced0c0

File tree

5 files changed

+70
-25
lines changed

5 files changed

+70
-25
lines changed

src/dim_helpers.jl

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,4 +119,18 @@ function predilate(x::AbstractArray{T,N}, dilation::NTuple{M}) where {T, N, M}
119119
# zeros between each element of `x` along each spatial dimension.
120120
x_dil[(1:dilation[idx]:size(x_dil,idx) for idx in 1:(N-2))..., :, :] .= x
121121
return x_dil
122-
end
122+
end
123+
"""
    flipweight(w::AbstractArray)

Reorders the weight tensor for supporting both convolution and cross-correlation operations.

For 4-dimensional arrays the first two dimensions are reversed, for 5-dimensional arrays the
first three; the last two dimensions are always left in their original order. In both cases
a new array is returned. Arrays of any other rank are returned unchanged (the very same
object, not a copy).
"""
function flipweight end

# Fallback for every rank that has no specialised method below: nothing to flip, so simply
# return the original array.
@inline flipweight(w::AbstractArray) = w

# Rank 4: reverse dimensions 1 and 2.
@inline flipweight(w::AbstractArray{T, 4}) where {T} = w[end:-1:1, end:-1:1, :, :]

# Rank 5: reverse dimensions 1, 2 and 3.
@inline flipweight(w::AbstractArray{T, 5}) where {T} = w[end:-1:1, end:-1:1, end:-1:1, :, :]

src/nnpack/NNPACK.jl

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,22 @@
11
include("libnnpack_types.jl")
22
include("error.jl")
33
include("libnnpack.jl")
4+
include("performance.jl")
5+
include("interface.jl")
46

57
# Location of `deps.jl`, resolved relative to this file (two directories up, then `deps/`).
const depsjl_path = joinpath(dirname(@__FILE__), "..", "..", "deps", "deps.jl")

# Stop loading with an actionable message when `deps.jl` is missing.
isfile(depsjl_path) ||
    error("NNPACK not installed properly, run Pkg.build(\"NNlib\"), restart Julia and try again")
include(depsjl_path)

# Threadpool handles keyed by their thread count; filled in by `allocate_threadpool`.
const shared_threadpool_dict = Dict{UInt64, Base.RefValue}()
1314

15+
"""
16+
is_nnpack_available()
17+
18+
Checks if the current hardware is supported by NNPACK.
19+
"""
1420
function is_nnpack_available()
1521
check_deps()
1622
status = nnp_initialize()
@@ -21,18 +27,30 @@ function is_nnpack_available()
2127
end
2228
end
2329

"""
    allocate_threadpool()

Allocates several threadpools based on the upper limit on the number of threads for the
machine. Allows NNPACK to intelligently choose which threadpool to use for getting the best
performance.

One pool is created for every power of two `2^i` with `i >= 1` that does not exceed
`NNPACK_CPU_THREADS`; each pool is stored in `shared_threadpool_dict` under its thread
count. Nothing is allocated when `NNPACK_CPU_THREADS` is smaller than 2. Returns `nothing`.
"""
function allocate_threadpool()
    # `log2(0)` is `-Inf`, which cannot be converted to an integer. Returning early keeps a
    # thread limit of 0 from aborting initialisation with an `InexactError`; a limit of 1
    # produced an empty loop before and still allocates nothing.
    NNPACK_CPU_THREADS < 2 && return nothing
    for i in 1:floor(Int, log2(NNPACK_CPU_THREADS))
        threads = UInt64(1) << i
        shared_threadpool_dict[threads] = Ref(pthreadpool_create(threads))
    end
    return nothing
end
43+
2444
# Initialisation hook: checks the binary dependency, initialises NNPACK, determines the
# thread limit and allocates the shared threadpools.
@init begin
    check_deps()
    status = nnp_initialize()
    if status == nnp_status_unsupported_hardware
        @warn "Hardware is unsupported by NNPACK so falling back to default NNlib"
    end
    # The thread limit is read from the `NNPACK_CPU_THREADS` environment variable. When the
    # variable is unset or does not hold a valid unsigned integer, `tryparse` yields
    # `nothing` and the limit falls back to `Sys.CPU_THREADS`. This mirrors the previous
    # `try`/`catch` fallback without using exceptions for control flow.
    global NNPACK_CPU_THREADS = something(
        tryparse(UInt64, get(ENV, "NNPACK_CPU_THREADS", "")),
        Sys.CPU_THREADS,
    )
    allocate_threadpool()
end

src/nnpack/impl.jl

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,45 @@
1-
@inline flipweight(w::Array{<:Any, 4}) = w[end:-1:1,end:-1:1,:,:]
2-
31
"""
    maxpool_nnpack!(y, x, pdims::PoolDims)

Validate the dimensions of `x` and `y` against `pdims`, pick a threadpool for the batch size
`size(y, 4)`, and forward the call to `nnp_max_pooling_output` with the kernel size, padding
and stride taken from `pdims`. Returns whatever `nnp_max_pooling_output` returns.
"""
function maxpool_nnpack!(y::A, x::A, pdims::PoolDims) where {A<:Array{Float32, 4}}
    check_dims(size(x), size(y), pdims)
    batch = size(y, 4)
    pool = select_threadpool(pdims, batch)
    return nnp_max_pooling_output(
        y, x, kernel_size(pdims);
        padding = padding(pdims), stride = stride(pdims), threadpool = pool,
    )
end
77

88
# conv_nnpack!(y, x, w, cdims; b, algo = UInt32(0))
#
# Validates the dimensions of `x`, `w` and `y` against `cdims`, selects a threadpool for the
# batch size `size(y, 4)` and forwards the call to `nnp_convolution_output`.
#
# Keywords:
#   b    -- bias vector; defaults to zeros with one entry per channel of the output `y`.
#           NOTE(review): this assumes dimension 3 of `y` is the channel dimension (in line
#           with dimension 4 being used as the batch size below) and that NNPACK expects one
#           bias value per output channel -- confirm against the NNPACK header. The previous
#           default was sized by `size(x, 3)`, which only agrees when `x` and `y` have the
#           same number of channels.
#   algo -- algorithm selector handed to `nnp_convolution_output` unchanged.
@timeit_debug to function conv_nnpack!(y::A1, x::A1, w::A1, cdims::ConvDims;
                                       b::A2 = zeros(Float32, size(y, 3)),
                                       algo = UInt32(0)) where {A1<:Array{Float32, 4},
                                                                A2<:Array{Float32, 1}}
    check_dims(size(x), size(w), size(y), cdims)
    threadpool = select_threadpool(cdims, size(y, 4))

    # Rebind `w` to the flipped copy instead of writing it back with `w .= ...`: the
    # in-place form overwrote the caller's weights, so every second call ran with the
    # kernel flipped back again.
    if flipkernel(cdims) == 0
        w = flipweight(w)
    end

    nnp_convolution_output(y, x, w, b, algo = algo, padding = padding(cdims),
                           stride = stride(cdims), threadpool = threadpool)
end
2022

2123
# ∇conv_data_nnpack!(dx, dy, w, cdims; algo = UInt32(0))
#
# Validates the dimensions of `dx`, `w` and `dy` against `cdims`, selects a threadpool for
# the batch size `size(dy, 4)` and forwards the call to `nnp_convolution_input_gradient`.
# `algo` is handed to that routine unchanged.
@timeit_debug to function ∇conv_data_nnpack!(dx::A, dy::A, w::A, cdims::ConvDims;
                                             algo = UInt32(0)) where{A<:Array{Float32, 4}}
    check_dims(size(dx), size(w), size(dy), cdims)
    # The batch size is read from `dy`; this function has no `y` argument, so the former
    # `size(y, 4)` raised an `UndefVarError` on every call.
    threadpool = select_threadpool(cdims, size(dy, 4))

    # Rebind `w` to the flipped copy rather than overwriting the caller's weights in place,
    # which would flip them back and forth on successive calls.
    if flipkernel(cdims) == 0
        w = flipweight(w)
    end

    nnp_convolution_input_gradient(dx, dy, w, algo = algo, padding = padding(cdims),
                                   stride = stride(cdims), threadpool = threadpool)
end
3135

3236
@timeit_debug to function ∇conv_filter_nnpack!(dw::A, x::A, dy::A, cdims::ConvDims;
3337
algo = UInt32(0)) where{A<:Array{Float32, 4}}
3438
check_dims(size(x), size(dw), size(dy), cdims)
39+
threadpool = select_threadpool(cdims, size(y, 4))
3540

36-
nnp_convolution_kernel_gradient(dw, x, dy, algo = algo, padding = padding(cdims), stride = stride(cdims))
41+
nnp_convolution_kernel_gradient(dw, x, dy, algo = algo, padding = padding(cdims),
42+
stride = stride(cdims), threadpool = threadpool)
3743

3844
if flipkernel(cdims) == 0
3945
dw .= flipweight(dw)

src/nnpack/libnnpack.jl

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ function nnp_relu_output(batch_size, channels, input, output, negative_slope, th
1515
@nnpack_check ccall((:nnp_relu_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, input, output, negative_slope, threadpool)
1616
end
1717

18-
function nnp_relu_output(x::Array{Float32,N}, y::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = shared_threadpool[]) where {N}
18+
function nnp_relu_output(x::Array{Float32,N}, y::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = C_NULL) where {N}
1919
# Investigate why the channel and batch dims need to specified like this
2020
nnp_relu_output(prod(size(x)[N-1:N]), prod(size(x)[1:N-2]), x, y, negative_slope, threadpool)
2121
y
@@ -25,7 +25,7 @@ function nnp_relu_input_gradient(batch_size, channels, grad_output, input, grad_
2525
@nnpack_check ccall((:nnp_relu_input_gradient, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Cfloat, pthreadpool_t), batch_size, channels, grad_output, input, grad_input, negative_slope, threadpool)
2626
end
2727

28-
function nnp_relu_input_gradient(x::Array{Float32,N}, dy::Array{Float32,N}, dx::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = shared_threadpool[]) where {N}
28+
function nnp_relu_input_gradient(x::Array{Float32,N}, dy::Array{Float32,N}, dx::Array{Float32,N}; negative_slope::AbstractFloat = 0.0, threadpool = C_NULL) where {N}
2929
# Investigate why the channel and batch dims need to specified like this
3030
nnp_relu_input_gradient(Csize_t(prod(size(x)[N-1:N])), prod(size(x)[1:N-2]), dy, x, dx, negative_slope, threadpool)
3131
dx
@@ -35,7 +35,7 @@ function nnp_softmax_output(batch_size, channels, input, output, threadpool)
3535
@nnpack_check ccall((:nnp_softmax_output, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input, output, threadpool)
3636
end
3737

38-
function nnp_softmax_output(x::VecOrMat{Float32}, y::VecOrMat{Float32}; threadpool = shared_threadpool[])
38+
"""
    nnp_softmax_output(x::VecOrMat{Float32}, y::VecOrMat{Float32}; threadpool = C_NULL)

Forward `x` and `y` to the low-level `nnp_softmax_output` wrapper, using `size(x, 1)` as the
channel count and the number of columns of `x` as the batch size. Returns `y`.
"""
function nnp_softmax_output(x::VecOrMat{Float32}, y::VecOrMat{Float32}; threadpool = C_NULL)
    # `size(x, 2)` is 1 for a vector, so vectors and matrices need no separate branch.
    batch_size = size(x, 2)
    channels = size(x, 1)
    nnp_softmax_output(batch_size, channels, x, y, threadpool)
    return y
end
@@ -47,7 +47,7 @@ function nnp_fully_connected_output(batch_size, input_channels, output_channels,
4747
@nnpack_check ccall((:nnp_fully_connected_output, libnnpack), nnp_status, (Csize_t, Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t, Ptr{Cvoid}), batch_size, input_channels, output_channels, input, kernel, output, threadpool, C_NULL)
4848
end
4949

50-
function nnp_fully_connected_output(x::Array{Float32,2}, w::Array{Float32,2}, y::Array{Float32,2}; profile = nothing, threadpool = shared_threadpool[])
50+
function nnp_fully_connected_output(x::Array{Float32,2}, w::Array{Float32,2}, y::Array{Float32,2}; profile = nothing, threadpool = C_NULL)
5151
profile = profile == nothing ? nnp_profile() : profile
5252
nnp_fully_connected_output(size(x, 2), size(x, 1), size(w, 1), x, w, y, threadpool, profile)
5353
y
@@ -57,10 +57,10 @@ function nnp_fully_connected_inference_f16f32(input_channels, output_channels, i
5757
@nnpack_check ccall((:nnp_fully_connected_inference_f16f32, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cvoid}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
5858
end
5959

60-
nnp_fully_connected_inference_f16f32(x::Array{Float32, 1}, w::Array{Float16,2}, y::Array{Float32, 1}; threadpool = shared_threadpool[]) =
60+
"""
    nnp_fully_connected_inference_f16f32(x::Vector{Float32}, w::Matrix{Float16},
                                         y::Vector{Float32}; threadpool = C_NULL)

Single-sample variant: forwards `x`, `w` and `y` to the low-level
`nnp_fully_connected_inference_f16f32` wrapper with `length(x)` input channels and
`length(y)` output channels. Returns `y`.
"""
function nnp_fully_connected_inference_f16f32(x::Array{Float32, 1}, w::Array{Float16, 2},
                                              y::Array{Float32, 1}; threadpool = C_NULL)
    # Call the half-precision-kernel wrapper directly. The previous body used
    # `reshape(x, size(x), 1)`, for which `reshape` has no method, and delegated to
    # `nnp_fully_connected_inference`, which has no method taking `Float16` weights.
    nnp_fully_connected_inference_f16f32(length(x), length(y), x, w, y, threadpool)
    return y
end
6262

63-
function nnp_fully_connected_inference_f16f32(x::Array{Float32, 2}, w::Array{Float16,2}, y::Array{Float32, 2}; threadpool = shared_threadpool[])
63+
"""
    nnp_fully_connected_inference_f16f32(x::Matrix{Float32}, w::Matrix{Float16},
                                         y::Matrix{Float32}; threadpool = C_NULL)

Forwards `x`, `w` and `y` to the low-level `nnp_fully_connected_inference_f16f32` wrapper
with `size(x, 1)` input channels and `size(y, 1)` output channels. Returns `y`.
"""
function nnp_fully_connected_inference_f16f32(x::Array{Float32, 2}, w::Array{Float16, 2},
                                              y::Array{Float32, 2}; threadpool = C_NULL)
    # The weights are `Float16`, so they must go to the f16f32 wrapper. The previous body
    # called the `nnp_fully_connected_inference` wrapper, whose ccall declares the kernel as
    # `Ptr{Cfloat}` and therefore treats the data as single precision.
    nnp_fully_connected_inference_f16f32(size(x, 1), size(y, 1), x, w, y, threadpool)
    return y
end
@@ -69,10 +69,10 @@ function nnp_fully_connected_inference(input_channels, output_channels, input, k
6969
@nnpack_check ccall((:nnp_fully_connected_inference, libnnpack), nnp_status, (Csize_t, Csize_t, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), input_channels, output_channels, input, kernel, output, threadpool)
7070
end
7171

72-
nnp_fully_connected_inference(x::Array{Float32, 1}, w::Array{Float32,2}; threadpool = shared_threadpool[]) =
72+
"""
    nnp_fully_connected_inference(x::Vector{Float32}, w::Matrix{Float32}; threadpool = C_NULL)

Single-sample variant without a caller-provided output: allocates the result vector,
forwards everything to the low-level `nnp_fully_connected_inference` wrapper and returns the
result.

Throws a `DimensionMismatch` when `size(w, 2)` differs from `length(x)`.
"""
function nnp_fully_connected_inference(x::Array{Float32, 1}, w::Array{Float32, 2};
                                       threadpool = C_NULL)
    # NOTE(review): assumes `w` is laid out as (output_channels, input_channels), mirroring
    # `nnp_fully_connected_output`, which passes `size(w, 1)` as the output channel count --
    # confirm against the callers.
    size(w, 2) == length(x) || throw(DimensionMismatch(
        "w has $(size(w, 2)) input channels but x has length $(length(x))"))
    # The previous body called `reshape(x, size(x), 1)`, for which `reshape` has no method,
    # and then relied on a two-argument matrix method. Allocate the output here and call the
    # low-level wrapper directly instead.
    y = zeros(Float32, size(w, 1))
    nnp_fully_connected_inference(length(x), length(y), x, w, y, threadpool)
    return y
end
7474

75-
function nnp_fully_connected_inference(x::Array{Float32, 2}, w::Array{Float32, 2}, y::Array{Float32, 2}; threadpool = shared_threadpool[])
75+
"""
    nnp_fully_connected_inference(x::Matrix{Float32}, w::Matrix{Float32},
                                  y::Matrix{Float32}; threadpool = C_NULL)

Forwards `x`, `w` and `y` to the low-level `nnp_fully_connected_inference` wrapper with
`size(x, 1)` input channels and `size(y, 1)` output channels. Returns `y`.
"""
function nnp_fully_connected_inference(x::Array{Float32, 2}, w::Array{Float32, 2},
                                       y::Array{Float32, 2}; threadpool = C_NULL)
    input_channels = size(x, 1)
    output_channels = size(y, 1)
    nnp_fully_connected_inference(input_channels, output_channels, x, w, y, threadpool)
    return y
end
@@ -81,7 +81,7 @@ function nnp_max_pooling_output(batch_size, channels, input_size, input_padding,
8181
@nnpack_check ccall((:nnp_max_pooling_output, libnnpack), nnp_status, (Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, pthreadpool_t), batch_size, channels, input_size, input_padding, pooling_size, pooling_stride, input, output, threadpool)
8282
end
8383

84-
function nnp_max_pooling_output(y::Array{Float32,4}, x::Array{Float32,4}, kernel::Tuple; padding = 0, stride = 1, threadpool = shared_threadpool[])
84+
function nnp_max_pooling_output(y::Array{Float32,4}, x::Array{Float32,4}, kernel::Tuple; padding = 0, stride = 1, threadpool = C_NULL)
8585
input_size = nnp_size(Csize_t.((size(x, 1), size(x, 2)))...)
8686
pooling_size = nnp_size(Csize_t.(kernel)...)
8787
input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
@@ -96,7 +96,7 @@ function nnp_convolution_input_gradient(algorithm, batch_size, input_channels, o
9696
@nnpack_check ccall((:nnp_convolution_input_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, grad_output, kernel, grad_input, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
9797
end
9898

99-
function nnp_convolution_input_gradient(dx::Array{Float32,4}, dy::Array{Float32,4}, w::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = shared_threadpool[], profile = nothing)
99+
function nnp_convolution_input_gradient(dx::Array{Float32,4}, dy::Array{Float32,4}, w::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing)
100100
input_size = nnp_size(Csize_t.((size(dx,1), size(dx,2)))...)
101101
kernel_size = nnp_size(Csize_t.((size(w,1),size(w,2)))...)
102102
input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
@@ -110,7 +110,7 @@ function nnp_convolution_kernel_gradient(algorithm, batch_size, input_channels,
110110
@nnpack_check ccall((:nnp_convolution_kernel_gradient, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, grad_output, grad_kernel, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
111111
end
112112

113-
function nnp_convolution_kernel_gradient(dw::Array{Float32,4}, x::Array{Float32,4}, dy::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = shared_threadpool[], profile = nothing)
113+
function nnp_convolution_kernel_gradient(dw::Array{Float32,4}, x::Array{Float32,4}, dy::Array{Float32,4}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing)
114114
input_size = nnp_size(Csize_t.((size(x,1), size(x,2)))...)
115115
kernel_size = nnp_size(Csize_t.((size(dw,1),size(dw,2)))...)
116116
input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))
@@ -124,7 +124,7 @@ function nnp_convolution_output(algorithm, batch_size, input_channels, output_ch
124124
@nnpack_check ccall((:nnp_convolution_output, libnnpack), nnp_status, (nnp_convolution_algorithm, Csize_t, Csize_t, Csize_t, nnp_size, nnp_padding, nnp_size, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cvoid}, Csize_t, nnp_activation, Ptr{Cvoid}, pthreadpool_t, Ptr{Cvoid}), algorithm, batch_size, input_channels, output_channels, input_size, input_padding, kernel_size, input, kernel, bias, output, workspace_buffer, workspace_size, activation, activation_parameters, threadpool, C_NULL)
125125
end
126126

127-
function nnp_convolution_output(y::Array{Float32,4}, x::Array{Float32,4}, w::Array{Float32,4}, b::Array{Float32,1}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = shared_threadpool[], profile = nothing)
127+
function nnp_convolution_output(y::Array{Float32,4}, x::Array{Float32,4}, w::Array{Float32,4}, b::Array{Float32,1}; algo::nnp_convolution_algorithm = UInt32(0), workspace_buffer = nothing, workspace_size = 0, padding = 0, stride = 1, threadpool = C_NULL, profile = nothing)
128128
input_size = nnp_size(Csize_t.((size(x,1), size(x,2)))...)
129129
kernel_size = nnp_size(Csize_t.((size(w,1),size(w,2)))...)
130130
input_padding = nnp_padding(Csize_t(padding[2]), Csize_t(padding[1]), Csize_t(padding[2]), Csize_t(padding[1]))

src/nnpack/performance.jl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""
    select_threadpool(dims, batch_size::Int)

Return the threadpool to use for an operation described by `dims` (a `DenseConvDims` or a
`PoolDims`) and the given batch size. Both methods currently ignore their arguments and
return `C_NULL`.
"""
select_threadpool(cdims::DenseConvDims, batch_size::Int) = C_NULL

select_threadpool(pdims::PoolDims, batch_size::Int) = C_NULL

0 commit comments

Comments
 (0)