
Commit 1b4192c

Use new capabilities in TimerOutputs.jl to instrument NNlib for performance monitoring
This uses the new zero-overhead instrumentation capabilities of `TimerOutputs.jl` to embed instrumentation that is compiled out by default, but can be trivially enabled (triggering recompilation of all instrumented methods) by running `TimerOutputs.enable_debug_timings(NNlib)`.
1 parent 494711e commit 1b4192c
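
For reference, a minimal sketch of how these timings would be collected and inspected, assuming the module-level timer `NNlib.to` added in `src/NNlib.jl` below and the `enable_debug_timings`/`print_timer` API from TimerOutputs.jl (input sizes are purely illustrative):

```julia
using TimerOutputs, NNlib

# Debug timings are compiled out by default; enabling them forces
# recompilation of every `@timeit_debug`-annotated method in NNlib.
TimerOutputs.enable_debug_timings(NNlib)

x = randn(Float32, 32, 32, 3, 4)   # hypothetical WHCN input
w = randn(Float32, 3, 3, 3, 8)     # hypothetical kernel
y = NNlib.conv(x, w)               # instrumented calls record into NNlib.to

print_timer(NNlib.to)              # show accumulated timing/allocation data
```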

File tree: 10 files changed, +307 −323 lines

src/NNlib.jl

Lines changed: 4 additions & 1 deletion

```diff
@@ -1,5 +1,5 @@
 module NNlib
-using Requires
+using Requires, TimerOutputs
 
 # Include APIs
 include("dim_helpers.jl")
@@ -21,4 +21,7 @@ include("impl/depthwiseconv_im2col.jl")
 
 # Direct implementations of pooling
 include("impl/pooling_direct.jl")
+
+to = TimerOutput()
+
 end # module NNlib
```

src/conv.jl

Lines changed: 36 additions & 40 deletions

```diff
@@ -45,9 +45,9 @@ for (front_name, backend) in (
     # We only define 3d conv primitives, we reshape lower down to get 1d and 2d convolution
     @eval begin
         # im2col-accelerated function forwarding definition
-        function $(Symbol("$(front_name)!"))(
-                        out::AbstractArray{T,5}, in1::AbstractArray{T,5},
-                        in2::AbstractArray{T,5}, cdims::ConvDims; kwargs...) where {T <: $G}
+        @timeit_debug to function $(Symbol("$(front_name)!"))(
+                        out::AbstractArray{T,5}, in1::AbstractArray{T,5},
+                        in2::AbstractArray{T,5}, cdims::ConvDims; kwargs...) where {T <: $G}
             $(Symbol("$(front_name)_$(backend)!"))(out, in1, in2, cdims; kwargs...)
         end
     end
@@ -62,9 +62,9 @@ for front_name in (:conv, :∇conv_data, :∇conv_filter,
     for N in (3, 4)
         @eval begin
             function $(Symbol("$(front_name)$(backend)!"))(
-                        y::AbstractArray{yT,$N}, x::AbstractArray{xT,$N},
-                        w::AbstractArray{wT,$N}, cdims::ConvDims;
-                        kwargs...) where {yT, xT, wT}
+                            y::AbstractArray{yT,$N}, x::AbstractArray{xT,$N},
+                            w::AbstractArray{wT,$N}, cdims::ConvDims;
+                            kwargs...) where {yT, xT, wT}
                 $(Symbol("$(front_name)$(backend)!"))(
                     insert_singleton_spatial_dimension(y, $(5 - N)),
                     insert_singleton_spatial_dimension(x, $(5 - N)),
@@ -88,43 +88,41 @@ end
 for front_name in (:conv, :∇conv_data, :∇conv_filter,
                    :depthwiseconv, :∇depthwiseconv_data, :∇depthwiseconv_filter)
     @eval begin
-        function $(Symbol("$(front_name)!"))(out::AbstractArray, in1::AbstractArray,
-                        in2::AbstractArray, cdims::ConvDims; kwargs...)
-            @debug "Slow fallback implementation invoked for $(front_name)! You probably don't want this; check your datatypes."
-            $(Symbol("$(front_name)_direct!"))(out, in1, in2, cdims; kwargs...)
+        function $(Symbol("$(front_name)!"))(
+                        y::AbstractArray{yT,N}, in1::AbstractArray{T1,N},
+                        in2::AbstractArray{T2,N}, cdims::ConvDims;
+                        kwargs...) where {yT, T1, T2, N}
+            @debug string("Slow fallback implementation invoked for $(front_name)! ",
+                          "You probably don't want this; check your datatypes.")
+            $(Symbol("$(front_name)_direct!"))(y, in1, in2, cdims; kwargs...)
         end
     end
 end
 
-# Finally, let's generate auto-allocating versions of all our functions, for all backends:
+# Finally, let's generate auto-allocating versions of all our functions, for all backends.
+# We `@timeit` these methods separately, as we want to know how much time is spent in
+# allocation. :P
 for backend in (Symbol(), :_direct, :_im2col)
     # First make auto-allocating versions of the conv()-like calls:
     for name in (:conv, :depthwiseconv)
         @eval begin
-            function $(Symbol("$(name)$(backend)"))(
-                            x::AbstractArray{xT,N}, w::AbstractArray{wT,N},
-                            cdims::ConvDims; kwargs...) where {xT, wT, N}
-                yT = promote_type(xT, wT)
-                # Annoyingly, we must allocate with `zeros()` because if we were to use
-                # the faster `similar()`, it may have NaNs within it, which will poison
-                # the output because we support accumulation (even with `beta = 0` the
-                # NaNs poison us as NaN * 0 == NaN). This is a bit of a shame, but it's
-                # not really that bad as if you're truly interested in performance, you
-                # should be allocating your own `y` and calling the non-allocating
-                # variant of this method anyway.
-                y = zeros(yT, output_size(cdims)..., channels_out(cdims), size(x, N))
+            @timeit_debug to function $(Symbol("$(name)$(backend)"))(
+                            x::AbstractArray{xT,N}, w::AbstractArray{wT,N},
+                            cdims::ConvDims; kwargs...) where {xT, wT, N}
+                y = similar(x, promote_type(xT, wT), output_size(cdims)...,
+                            channels_out(cdims), size(x,N))
                 return $(Symbol("$(name)$(backend)!"))(y, x, w, cdims; kwargs...)
             end
         end
     end
 
     for name in (:∇conv_data, :∇depthwiseconv_data)
         @eval begin
-            function $(Symbol("$(name)$(backend)"))(
-                            dy::AbstractArray{yT,N}, w::AbstractArray{wT,N},
-                            cdims::cdT; kwargs...) where {yT, wT, N, cdT <: ConvDims}
-                # Again, allocate with zeros
-                dx = zeros(yT, input_size(cdims)..., channels_in(cdims), size(dy, N))
+            @timeit_debug to function $(Symbol("$(name)$(backend)"))(
+                            dy::AbstractArray{yT,N}, w::AbstractArray{wT,N},
+                            cdims::ConvDims; kwargs...) where {yT, wT, N}
+                dx = similar(dy, input_size(cdims)..., channels_in(cdims),
+                             size(dy, N))
                 return $(Symbol("$(name)$(backend)!"))(dx, dy, w, cdims; kwargs...)
             end
         end
@@ -133,23 +131,21 @@ for backend in (Symbol(), :_direct, :_im2col)
     # We do the conv/depthwiseconv filter backprops separately, as the shape calculation
     # for `w` is slightly different for depthwise than for normal dense convolution.
     @eval begin
-        function $(Symbol("∇conv_filter$(backend)"))(
-                        x::AbstractArray{xT,N}, dy::AbstractArray{yT,N},
-                        cdims::cdT; kwargs...) where {xT, yT, N, cdT <: ConvDims}
-            # Again, allocate with zeros
-            dw = zeros(yT, kernel_size(cdims)..., channels_in(cdims),
-                       channels_out(cdims))
+        @timeit_debug to function $(Symbol("∇conv_filter$(backend)"))(
+                        x::AbstractArray{xT,N}, dy::AbstractArray{yT,N},
+                        cdims::ConvDims; kwargs...) where {xT, yT, N}
+            dw = similar(dy, kernel_size(cdims)..., channels_in(cdims),
+                         channels_out(cdims))
             return $(Symbol("∇conv_filter$(backend)!"))(dw, x, dy, cdims; kwargs...)
         end
     end
 
     @eval begin
-        function $(Symbol("∇depthwiseconv_filter$(backend)"))(
-                        x::AbstractArray{xT,N}, dy::AbstractArray{yT,N},
-                        cdims::cdT; kwargs...) where {xT, yT, N, cdT <: ConvDims}
-            # Again, allocate with zeros
-            dw = zeros(yT, kernel_size(cdims)..., channel_multiplier(cdims),
-                       channels_in(cdims))
+        @timeit_debug to function $(Symbol("∇depthwiseconv_filter$(backend)"))(
+                        x::AbstractArray{xT,N}, dy::AbstractArray{yT,N},
+                        cdims::ConvDims; kwargs...) where {xT, yT, N}
+            dw = similar(dy, kernel_size(cdims)..., channel_multiplier(cdims),
+                         channels_in(cdims))
             return $(Symbol("∇depthwiseconv_filter$(backend)!"))(dw, x, dy, cdims;
                                                                  kwargs...)
        end
```
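
As a usage note, here is a hedged sketch of the auto-allocating entry points this diff rewrites; array sizes are illustrative, and `DenseConvDims` is the dims type used elsewhere in this commit:

```julia
using NNlib

x = randn(Float32, 28, 28, 3, 1)   # WHCN input (illustrative sizes)
w = randn(Float32, 3, 3, 3, 16)    # kernel: kw × kh × C_in × C_out
cdims = DenseConvDims(x, w)

# The auto-allocating wrapper builds `y` with `similar()` (uninitialized) and
# forwards to the in-place `conv!`; performance-sensitive callers can allocate
# `y` themselves and call `conv!(y, x, w, cdims)` directly.
y = conv(x, w, cdims)
size(y)   # (26, 26, 16, 1) with the default padding/stride
```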

src/gemm.jl

Lines changed: 22 additions & 19 deletions

```diff
@@ -1,9 +1,23 @@
 ## Low level gemm! call with pointers
-## Borrowed from Knet.jl
+## Borrowed from Knet.jl, adapted for compile-time constants
 
 using LinearAlgebra
 using LinearAlgebra.BLAS: libblas, BlasInt, @blasfunc
 
+"""
+    gemm!()
+
+Low-level gemm!() call with pointers, borrowed from Knet.jl
+
+Calculates `C = alpha*op(A)*op(B) + beta*C`, where:
+  - `transA` and `transB` set `op(X)` to be either `identity()` or `transpose()`
+  - alpha and beta are scalars
+  - op(A) is an (M, K) matrix
+  - op(B) is a (K, N) matrix
+  - C is an (M, N) matrix.
+"""
+gemm!
+
 # These are the datatypes we have fast GEMM for
 gemm_datatype_mappings = (
     (:dgemm_, Float64),
@@ -13,34 +27,23 @@ gemm_datatype_mappings = (
 )
 for (gemm, elt) in gemm_datatype_mappings
     @eval begin
-        """
-            gemm!()
-
-        Low-level gemm!() call with pointers, borrowed from Knet.jl
-
-        Calculates `C = alpha*op(A)*op(B) + beta*C`, where:
-          - `transA` and `transB` set `op(X)` to be either `identity()` or `transpose()`
-          - alpha and beta are scalars
-          - op(A) is an (M, K) matrix
-          - op(B) is a (K, N) matrix
-          - C is an (M, N) matrix.
-        """
-        @inline function gemm!(transA::Val, transB::Val, M::Int, N::Int, K::Int,
+        @inline @timeit_debug to function gemm!(transA::Val, transB::Val,
+                                                M::Int, N::Int, K::Int,
                                alpha::$(elt), A::Ptr{$elt}, B::Ptr{$elt},
                                beta::$(elt), C::Ptr{$elt})
             # Convert our compile-time transpose marker to a char for BLAS
             convtrans(V::Val{false}) = 'N'
             convtrans(V::Val{true}) = 'T'
 
-            if transA==Val(false)
-                lda=M
+            if transA == Val(false)
+                lda = M
             else
-                lda=K
+                lda = K
             end
             if transB == Val(false)
-                ldb=K
+                ldb = K
             else
-                ldb=N
+                ldb = N
             end
             ldc = M
             ccall((@blasfunc($(gemm)), libblas), Nothing,
```
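
To illustrate the operation described in the docstring, here is a small sketch using the standard `LinearAlgebra.BLAS.gemm!` wrapper (not NNlib's pointer-based variant above), which performs the same `C = alpha*op(A)*op(B) + beta*C` update:

```julia
using LinearAlgebra

M, K, N = 4, 3, 5
A = randn(M, K)          # op(A) is (M, K) with transA = 'N'
B = randn(K, N)          # op(B) is (K, N) with transB = 'N'
C = zeros(M, N)

# Computes C = 2.0 * A * B + 0.0 * C, updating C in place
LinearAlgebra.BLAS.gemm!('N', 'N', 2.0, A, B, 0.0, C)

C ≈ 2.0 .* (A * B)       # true
```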

src/impl/conv_direct.jl

Lines changed: 30 additions & 15 deletions

```diff
@@ -33,13 +33,20 @@ calculates `y = alpha * x * w + beta * y`, therefore by setting `beta` to a nonz
 value, the user is able to accumulate values into a preallocated `y` buffer, or by
 setting `alpha` to a nonunitary value, an arbitrary gain factor can be applied.
 
+By defaulting `beta` to `false`, we make use of the Bradbury promotion trick to override
+`NaN`'s that may pre-exist within our output buffer, as `false*NaN == 0.0`, whereas
+`0.0*NaN == NaN`. Only set `beta` if you are certain that none of the elements within
+`y` are `NaN`.
+
 The basic implementation performs 3-dimensional convolution; 1-dimensional and 2-
 dimensional casesa are supported by simply reshaping `y`, `x` and `w`, for which
 wrapper methods are available.
 """
-function conv_direct!(y::AbstractArray{yT,5}, x::AbstractArray{xT,5},
+conv_direct!
+
+@timeit_debug to function conv_direct!(y::AbstractArray{yT,5}, x::AbstractArray{xT,5},
                       w::AbstractArray{wT,5}, cdims::DenseConvDims;
-                      alpha::yT = yT(1), beta::yT = yT(0)) where {yT, xT, wT}
+                      alpha::yT = yT(1), beta = false) where {yT, xT, wT}
     check_dims(size(x), size(w), size(y), cdims)
 
     width, height, depth = input_size(cdims)
@@ -50,12 +57,13 @@ function conv_direct!(y::AbstractArray{yT,5}, x::AbstractArray{xT,5},
     stride_w, stride_h, stride_d = stride(cdims)
     out_width, out_height, out_depth = output_size(cdims)
 
-    project(idx, s, p) = (idx - 1)*s - p + 1
-
     # If we're doing crosscorr instead of conv, then don't bother to flip `w`
     if !flipkernel(cdims)
         w = w[end:-1:1, end:-1:1, end:-1:1, :, :]
     end
+
+    # A helper function to project from output (w, h) to input (input_w, input_h)
+    @inline project(idx, stride, pad) = (idx - 1)*stride - pad + 1
 
     # explicit formulation of convolution. Oh hoisting gods, hear my plea.
     @inbounds for batch in 1:size(x)[end],
@@ -94,7 +102,7 @@
         y[w_idx, h_idx, d_idx, c_out, batch] = alpha*convert(yT, dotprod) +
                                                beta*y[w_idx, h_idx, d_idx, c_out, batch]
     end
-    
+
     return y
 end
 
@@ -104,27 +112,34 @@
 
 Calculate the gradient imposed upon `x` in the convolution `y = x * w`.
 """
-function ∇conv_data_direct!(dx::AbstractArray{xT,5}, dy::AbstractArray{yT,5},
+∇conv_data_direct!
+
+@timeit_debug to function ∇conv_data_direct!(dx::AbstractArray{xT,5}, dy::AbstractArray{yT,5},
                             w::AbstractArray{wT,5}, cdims::DenseConvDims;
-                            alpha::xT=xT(1), beta::xT=xT(0)) where {xT, yT, wT}
-    w = transpose_flipbatch(w[end:-1:1, end:-1:1, end:-1:1, :, :])
+                            alpha::xT=xT(1), beta=false) where {xT, yT, wT}
+    w = transpose_swapbatch(w[end:-1:1, end:-1:1, end:-1:1, :, :])
     dy = predilate(dy, stride(cdims))
     ctdims = DenseConvDims(dy, w; padding=transpose_pad(cdims),
-                           dilation=dilation(cdims), flipkernel=flipkernel(cdims))
-    return transpose_flipbatch(conv_direct!(dx, dy, w, ctdims; alpha=alpha, beta=beta))
+                           dilation=dilation(cdims),
+                           flipkernel=flipkernel(cdims))
+    dx = conv_direct!(dx, dy, w, ctdims; alpha=alpha, beta=beta)
+    return transpose_swapbatch(dx)
 end
 
 """
     ∇conv_filter_direct!(dw, x, dy, cdims; alpha=1, beta=0)
 
 Calculate the gradient imposed upon `w` in the convolution `y = x * w`.
 """
-function ∇conv_filter_direct!(dw::AbstractArray{wT,5}, x::AbstractArray{xT,5},
+∇conv_filter_direct!
+
+@timeit_debug to function ∇conv_filter_direct!(dw::AbstractArray{wT,5}, x::AbstractArray{xT,5},
                               dy::AbstractArray{yT,5}, cdims::DenseConvDims;
-                              alpha::wT=wT(1), beta::wT=wT(0)) where {xT, yT, wT}
-    x = transpose_flipbatch(x[end:-1:1, end:-1:1, end:-1:1, :, :])
-    dy = transpose_flipbatch(predilate(dy, stride(cdims)))
-    ctdims = DenseConvDims(dy, x; padding=transpose_pad(cdims), stride=dilation(cdims))
+                              alpha::wT=wT(1), beta=false) where {xT, yT, wT}
+    x = transpose_swapbatch(x[end:-1:1, end:-1:1, end:-1:1, :, :])
+    dy = transpose_swapbatch(predilate(dy, stride(cdims)))
+    ctdims = DenseConvDims(dy, x; padding=transpose_pad(cdims),
+                           stride=dilation(cdims))
     conv_direct!(dw, dy, x, ctdims; alpha=alpha, beta=beta)
     if flipkernel(cdims)
         dw .= dw[end:-1:1, end:-1:1, end:-1:1, :, :]
```
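
A quick sketch of the `false`-as-strong-zero behavior the new docstring relies on; this is plain Julia semantics, nothing NNlib-specific:

```julia
# An uninitialized buffer from `similar()` may contain NaNs; with a numeric
# zero they poison the accumulation, but `false` acts as a strong zero:
0.0 * NaN      # NaN
false * NaN    # 0.0

# Hence `beta = false` lets `y = alpha*x*w + beta*y` ignore whatever garbage
# was already in `y`, while still supporting accumulation when the caller
# passes a numeric `beta`.
```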
