Skip to content

Commit 425cc59

Browse files
authored
Use GPU-friendly rounding (#591)
1 parent 627374c commit 425cc59

File tree

2 files changed

+90
-87
lines changed

2 files changed

+90
-87
lines changed

src/sampling.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ end
121121
ix = compute_source_index(x, iW, padding_mode)
122122
iy = compute_source_index(y, iH, padding_mode)
123123
# Get corner pixel values from (ix, iy) in north-west, north-east, south-west and south-east directions.
124-
ix_nw, iy_nw = floor(Int, ix), floor(Int, iy)
124+
ix_nw, iy_nw = unsafe_trunc(Int, floor(ix)), unsafe_trunc(Int, floor(iy))
125125
ix_ne, iy_ne = ix_nw + 1, iy_nw
126126
ix_sw, iy_sw = ix_nw, iy_nw + 1
127127
ix_se, iy_se = ix_ne, iy_sw
@@ -194,7 +194,7 @@ end
194194
ix, gix_mult = ∇compute_source_index(x, iW, padding_mode)
195195
iy, giy_mult = ∇compute_source_index(y, iH, padding_mode)
196196
# Get corner pixel values from (ix, iy) in north-west, north-east, south-west and south-east directions.
197-
ix_nw, iy_nw = floor(Int, ix), floor(Int, iy)
197+
ix_nw, iy_nw = unsafe_trunc(Int, floor(ix)), unsafe_trunc(Int, floor(iy))
198198
ix_ne, iy_ne = ix_nw + 1, iy_nw
199199
ix_sw, iy_sw = ix_nw, iy_nw + 1
200200
ix_se, iy_se = ix_ne, iy_sw

src/upsample.jl

Lines changed: 88 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -395,53 +395,52 @@ end
395395
@kernel function _upsample_linear_kernel!(::CPU, y::T, x::T, rwidth, align::Val{A}) where {
396396
T <: AbstractArray{<:Any, 3}, A,
397397
}
398-
@uniform in_width::UInt32, channels::UInt32, batch::UInt32 = size(x)
399-
@uniform out_width::UInt32 = size(y, 1)
400-
c::UInt32, n::UInt32 = @index(Global, NTuple)
398+
@uniform in_width, channels, batch = size(x)
399+
@uniform out_width = size(y, 1)
400+
c, n = @index(Global, NTuple)
401401
yv, xv = @view(y[:, c, n]), @view(x[:, c, n])
402-
@inbounds for i in UnitRange{UInt32}(one(UInt32), out_width)
403-
iw0, iw1, w0λ, w1λ = source_idx_and_λ(rwidth, i - one(UInt32), align, in_width)
402+
@inbounds for i in 1:out_width
403+
iw0, iw1, w0λ, w1λ = source_idx_and_λ(rwidth, i - 1, align, in_width)
404404
yv[i] = w0λ * xv[iw0] + w1λ * xv[iw1]
405405
end
406406
end
407407

408408
@kernel function _∇upsample_linear_kernel!(::CPU, dx::T1, Δ::T2, rwidth, align::Val{A}) where {
409409
T1 <: AbstractArray{<:Any, 3}, T2 <: AbstractArray{<:Any, 3}, A,
410410
}
411-
@uniform in_width::UInt32, channels::UInt32, batch::UInt32 = size(Δ)
412-
@uniform out_width::UInt32 = size(dx, 1)
413-
c::UInt32, n::UInt32 = @index(Global, NTuple)
411+
@uniform in_width, channels, batch = size(Δ)
412+
@uniform out_width = size(dx, 1)
413+
c, n = @index(Global, NTuple)
414414
Δv, dxv = @view(Δ[:, c, n]), @view(dx[:, c, n])
415-
@inbounds for i in UnitRange{UInt32}(one(UInt32), in_width)
416-
ow0, ow1, w0λ, w1λ = source_idx_and_λ(rwidth, i - one(UInt32), align, out_width)
415+
@inbounds for i in 1:in_width
416+
ow0, ow1, w0λ, w1λ = source_idx_and_λ(rwidth, i - 1, align, out_width)
417417
val = Δv[i]
418418
dxv[ow0] += w0λ * val
419419
dxv[ow1] += w1λ * val
420420
end
421421
end
422422

423423
# Linear (GPU): parallelization along width dimension.
424-
# TODO replace AbstractArray -> AnyGPUArray once device arrays subtype it.
425424

426425
@kernel function _upsample_linear_kernel!(::B, y::T, x::T, rwidth, align::Val{A}) where {
427426
B <: GPU, T <: AbstractArray{<:Any, 3}, A,
428427
}
429-
@uniform in_width::UInt32, channels::UInt32, batch::UInt32 = size(x)
430-
i::UInt32 = @index(Global)
431-
iw0, iw1, w0λ, w1λ = source_idx_and_λ(rwidth, i - one(UInt32), align, in_width)
432-
@inbounds for n in UnitRange{UInt32}(one(UInt32), batch), c in UnitRange{UInt32}(one(UInt32), channels)
428+
@uniform in_width, channels, batch = size(x)
429+
i = @index(Global)
430+
iw0, iw1, w0λ, w1λ = source_idx_and_λ(rwidth, i - 1, align, in_width)
431+
@inbounds for n in 1:batch, c in 1:channels
433432
y[i, c, n] = w0λ * x[iw0, c, n] + w1λ * x[iw1, c, n]
434433
end
435434
end
436435

437436
@kernel function _∇upsample_linear_kernel!(::B, dx::T, Δ::T, rwidth, align::Val{A}) where {
438437
B <: GPU, T <: AbstractArray{<:Any, 3}, A,
439438
}
440-
@uniform in_width::UInt32, channels::UInt32, batch::UInt32 = size(Δ)
441-
@uniform out_width::UInt32 = size(dx, 1)
442-
i::UInt32 = @index(Global)
443-
ow0, ow1, w0λ, w1λ = source_idx_and_λ(rwidth, i - one(UInt32), align, out_width)
444-
@inbounds for n in UnitRange{UInt32}(one(UInt32), batch), c in UnitRange{UInt32}(one(UInt32), channels)
439+
@uniform in_width, channels, batch = size(Δ)
440+
@uniform out_width = size(dx, 1)
441+
i = @index(Global)
442+
ow0, ow1, w0λ, w1λ = source_idx_and_λ(rwidth, i - 1, align, out_width)
443+
@inbounds for n in 1:batch, c in 1:channels
445444
val = Δ[i, c, n]
446445
@atomic dx[ow0, c, n] += w0λ * val
447446
@atomic dx[ow1, c, n] += w1λ * val
@@ -453,14 +452,14 @@ end
453452
@kernel function _upsample_linear_kernel!(::CPU, y::T, x::T, rwidth, rheight, align::Val{A}) where {
454453
T <: AbstractArray{<:Any, 4}, A,
455454
}
456-
@uniform in_width::UInt32, in_height::UInt32, channels::UInt32, batch::UInt32 = size(x)
457-
@uniform out_width::UInt32, out_height::UInt32 = size(y)[1:2]
458-
c::UInt32, n::UInt32 = @index(Global, NTuple)
455+
@uniform in_width, in_height, channels, batch = size(x)
456+
@uniform out_width, out_height = size(y)[1:2]
457+
c, n = @index(Global, NTuple)
459458
yv, xv = @view(y[:, :, c, n]), @view(x[:, :, c, n])
460-
for j in UnitRange{UInt32}(one(UInt32), out_height)
461-
ih0, ih1, h0λ, h1λ = source_idx_and_λ(rheight, j - one(UInt32), align, in_height)
462-
for i in UnitRange{UInt32}(one(UInt32), out_width)
463-
iw0, iw1, w0λ, w1λ = source_idx_and_λ(rwidth, i - one(UInt32), align, in_width)
459+
for j in 1:out_height
460+
ih0, ih1, h0λ, h1λ = source_idx_and_λ(rheight, j - 1, align, in_height)
461+
for i in 1:out_width
462+
iw0, iw1, w0λ, w1λ = source_idx_and_λ(rwidth, i - 1, align, in_width)
464463
@inbounds yv[i, j] =
465464
h0λ * (w0λ * xv[iw0, ih0] + w1λ * xv[iw1, ih0]) +
466465
h1λ * (w0λ * xv[iw0, ih1] + w1λ * xv[iw1, ih1])
@@ -471,14 +470,14 @@ end
471470
@kernel function _∇upsample_linear_kernel!(::CPU, dx::T1, Δ::T2, rwidth, rheight, align::Val{A}) where {
472471
T1 <: AbstractArray{<:Any, 4}, T2 <: AbstractArray{<:Any, 4}, A,
473472
}
474-
@uniform in_width::UInt32, in_height::UInt32, channels::UInt32, batch::UInt32 = size(Δ)
475-
@uniform out_width::UInt32, out_height::UInt32 = size(dx)[1:2]
476-
c::UInt32, n::UInt32 = @index(Global, NTuple)
473+
@uniform in_width, in_height, channels, batch = size(Δ)
474+
@uniform out_width, out_height = size(dx)[1:2]
475+
c, n = @index(Global, NTuple)
477476
Δv, dxv = @view(Δ[:, :, c, n]), @view(dx[:, :, c, n])
478-
for j in UnitRange{UInt32}(one(UInt32), in_height)
479-
oh0, oh1, h0λ, h1λ = source_idx_and_λ(rheight, j - one(UInt32), align, out_height)
480-
@inbounds for i in UnitRange{UInt32}(one(UInt32), in_width)
481-
ow0, ow1, w0λ, w1λ = source_idx_and_λ(rwidth, i - one(UInt32), align, out_width)
477+
for j in 1:in_height
478+
oh0, oh1, h0λ, h1λ = source_idx_and_λ(rheight, j - 1, align, out_height)
479+
@inbounds for i in 1:in_width
480+
ow0, ow1, w0λ, w1λ = source_idx_and_λ(rwidth, i - 1, align, out_width)
482481
val = Δv[i, j]
483482
dxv[ow0, oh0] += w0λ * h0λ * val
484483
dxv[ow1, oh0] += w1λ * h0λ * val
@@ -493,11 +492,11 @@ end
493492
@kernel function _upsample_linear_kernel!(::B, y::T, x::T, rwidth, rheight, align::Val{A}) where {
494493
B <: GPU, T <: AbstractArray{<:Any, 4}, A,
495494
}
496-
@uniform in_width::UInt32, in_height::UInt32, channels::UInt32, batch::UInt32 = size(x)
497-
i::UInt32, j::UInt32 = @index(Global, NTuple)
498-
iw0, iw1, w0λ, w1λ = source_idx_and_λ(rwidth, i - one(UInt32), align, in_width)
499-
ih0, ih1, h0λ, h1λ = source_idx_and_λ(rheight, j - one(UInt32), align, in_height)
500-
@inbounds for n in UnitRange{UInt32}(one(UInt32), batch), c in UnitRange{UInt32}(one(UInt32), channels)
495+
@uniform in_width, in_height, channels, batch = size(x)
496+
i, j = @index(Global, NTuple)
497+
iw0, iw1, w0λ, w1λ = source_idx_and_λ(rwidth, i - 1, align, in_width)
498+
ih0, ih1, h0λ, h1λ = source_idx_and_λ(rheight, j - 1, align, in_height)
499+
@inbounds for n in 1:batch, c in 1:channels
501500
y[i, j, c, n] =
502501
h0λ * (w0λ * x[iw0, ih0, c, n] + w1λ * x[iw1, ih0, c, n]) +
503502
h1λ * (w0λ * x[iw0, ih1, c, n] + w1λ * x[iw1, ih1, c, n])
@@ -507,12 +506,12 @@ end
507506
@kernel function _∇upsample_linear_kernel!(::B, dx::T, Δ::T, rwidth, rheight, align::Val{A}) where {
508507
B <: GPU, T <: AbstractArray{<:Any, 4}, A,
509508
}
510-
@uniform in_width::UInt32, in_height::UInt32, channels::UInt32, batch::UInt32 = size(Δ)
511-
@uniform out_width::UInt32, out_height::UInt32 = size(dx)[1:2]
512-
i::UInt32, j::UInt32 = @index(Global, NTuple)
513-
ow0, ow1, w0λ, w1λ = source_idx_and_λ(rwidth, i - one(UInt32), align, out_width)
514-
oh0, oh1, h0λ, h1λ = source_idx_and_λ(rheight, j - one(UInt32), align, out_height)
515-
@inbounds for n in UnitRange{UInt32}(one(UInt32), batch), c in UnitRange{UInt32}(one(UInt32), channels)
509+
@uniform in_width, in_height, channels, batch = size(Δ)
510+
@uniform out_width, out_height = size(dx)[1:2]
511+
i, j = @index(Global, NTuple)
512+
ow0, ow1, w0λ, w1λ = source_idx_and_λ(rwidth, i - 1, align, out_width)
513+
oh0, oh1, h0λ, h1λ = source_idx_and_λ(rheight, j - 1, align, out_height)
514+
@inbounds for n in 1:batch, c in 1:channels
516515
val = Δ[i, j, c, n]
517516
@atomic dx[ow0, oh0, c, n] += w0λ * h0λ * val
518517
@atomic dx[ow1, oh0, c, n] += w1λ * h0λ * val
@@ -526,17 +525,17 @@ end
526525
@kernel function _upsample_linear_kernel!(::CPU, y::T, x::T, rwidth, rheight, rdepth, align::Val{A}) where {
527526
T <: AbstractArray{<:Any, 5}, A,
528527
}
529-
@uniform in_width::UInt32, in_height::UInt32, in_depth::UInt32 = size(x)[1:3]
530-
@uniform channels::UInt32, batch::UInt32 = size(x, 4), size(x, 5)
531-
@uniform out_width::UInt32, out_height::UInt32, out_depth::UInt32 = size(y)[1:3]
532-
c::UInt32, n::UInt32 = @index(Global, NTuple)
528+
@uniform in_width, in_height, in_depth = size(x)[1:3]
529+
@uniform channels, batch = size(x, 4), size(x, 5)
530+
@uniform out_width, out_height, out_depth = size(y)[1:3]
531+
c, n = @index(Global, NTuple)
533532
yv, xv = @view(y[:, :, :, c, n]), @view(x[:, :, :, c, n])
534-
for k in UnitRange{UInt32}(one(UInt32), out_depth)
535-
id0, id1, d0λ, d1λ = source_idx_and_λ(rdepth, k - one(UInt32), align, in_depth)
536-
for j in UnitRange{UInt32}(one(UInt32), out_height)
537-
ih0, ih1, h0λ, h1λ = source_idx_and_λ(rheight, j - one(UInt32), align, in_height)
538-
for i in UnitRange{UInt32}(one(UInt32), out_width)
539-
iw0, iw1, w0λ, w1λ = source_idx_and_λ(rwidth, i - one(UInt32), align, in_width)
533+
for k in 1:out_depth
534+
id0, id1, d0λ, d1λ = source_idx_and_λ(rdepth, k - 1, align, in_depth)
535+
for j in 1:out_height
536+
ih0, ih1, h0λ, h1λ = source_idx_and_λ(rheight, j - 1, align, in_height)
537+
for i in 1:out_width
538+
iw0, iw1, w0λ, w1λ = source_idx_and_λ(rwidth, i - 1, align, in_width)
540539
@inbounds yv[i, j, k] =
541540
d0λ * (
542541
h0λ * (w0λ * xv[iw0, ih0, id0] + w1λ * xv[iw1, ih0, id0]) +
@@ -552,17 +551,17 @@ end
552551
@kernel function _∇upsample_linear_kernel!(::CPU, dx::T1, Δ::T2, rwidth, rheight, rdepth, align::Val{A}) where {
553552
T1 <: AbstractArray{<:Any, 5}, T2 <: AbstractArray{<:Any, 5}, A,
554553
}
555-
@uniform in_width::UInt32, in_height::UInt32, in_depth::UInt32 = size(Δ)[1:3]
556-
@uniform channels::UInt32, batch::UInt32 = size(Δ, 4), size(Δ, 5)
557-
@uniform out_width::UInt32, out_height::UInt32, out_depth::UInt32 = size(dx)[1:3]
558-
c::UInt32, n::UInt32 = @index(Global, NTuple)
554+
@uniform in_width, in_height, in_depth = size(Δ)[1:3]
555+
@uniform channels, batch = size(Δ, 4), size(Δ, 5)
556+
@uniform out_width, out_height, out_depth = size(dx)[1:3]
557+
c, n = @index(Global, NTuple)
559558
Δv, dxv = @view(Δ[:, :, :, c, n]), @view(dx[:, :, :, c, n])
560-
for k in UnitRange{UInt32}(one(UInt32), in_depth)
561-
od0, od1, d0λ, d1λ = source_idx_and_λ(rdepth, k - one(UInt32), align, out_depth)
562-
for j in UnitRange{UInt32}(one(UInt32), in_height)
563-
oh0, oh1, h0λ, h1λ = source_idx_and_λ(rheight, j - one(UInt32), align, out_height)
564-
@inbounds for i in UnitRange{UInt32}(one(UInt32), in_width)
565-
ow0, ow1, w0λ, w1λ = source_idx_and_λ(rwidth, i - one(UInt32), align, out_width)
559+
for k in 1:in_depth
560+
od0, od1, d0λ, d1λ = source_idx_and_λ(rdepth, k - 1, align, out_depth)
561+
for j in 1:in_height
562+
oh0, oh1, h0λ, h1λ = source_idx_and_λ(rheight, j - 1, align, out_height)
563+
@inbounds for i in 1:in_width
564+
ow0, ow1, w0λ, w1λ = source_idx_and_λ(rwidth, i - 1, align, out_width)
566565
val = Δv[i, j, k]
567566
dxv[ow0, oh0, od0] += w0λ * h0λ * d0λ * val
568567
dxv[ow1, oh0, od0] += w1λ * h0λ * d0λ * val
@@ -583,13 +582,13 @@ end
583582
@kernel function _upsample_linear_kernel!(::B, y::T, x::T, rwidth, rheight, rdepth, align::Val{A}) where {
584583
B <: GPU, T <: AbstractArray{<:Any, 5}, A,
585584
}
586-
@uniform in_width::UInt32, in_height::UInt32, in_depth::UInt32 = size(x)[1:3]
587-
@uniform channels::UInt32, batch::UInt32 = size(x, 4), size(x, 5)
588-
i::UInt32, j::UInt32, k::UInt32 = @index(Global, NTuple)
589-
iw0, iw1, w0λ, w1λ = source_idx_and_λ(rwidth, i - one(UInt32), align, in_width)
590-
ih0, ih1, h0λ, h1λ = source_idx_and_λ(rheight, j - one(UInt32), align, in_height)
591-
id0, id1, d0λ, d1λ = source_idx_and_λ(rdepth, k - one(UInt32), align, in_depth)
592-
@inbounds for n in UnitRange{UInt32}(one(UInt32), batch), c in UnitRange{UInt32}(one(UInt32), channels)
585+
@uniform in_width, in_height, in_depth = size(x)[1:3]
586+
@uniform channels, batch = size(x, 4), size(x, 5)
587+
i, j, k = @index(Global, NTuple)
588+
iw0, iw1, w0λ, w1λ = source_idx_and_λ(rwidth, i - 1, align, in_width)
589+
ih0, ih1, h0λ, h1λ = source_idx_and_λ(rheight, j - 1, align, in_height)
590+
id0, id1, d0λ, d1λ = source_idx_and_λ(rdepth, k - 1, align, in_depth)
591+
@inbounds for n in 1:batch, c in 1:channels
593592
y[i, j, k, c, n] =
594593
d0λ * (
595594
h0λ * (w0λ * x[iw0, ih0, id0, c, n] + w1λ * x[iw1, ih0, id0, c, n]) +
@@ -603,14 +602,14 @@ end
603602
@kernel function _∇upsample_linear_kernel!(::B, dx::T, Δ::T, rwidth, rheight, rdepth, align::Val{A}) where {
604603
B <: GPU, T <: AbstractArray{<:Any, 5}, A,
605604
}
606-
@uniform in_width::UInt32, in_height::UInt32, in_depth::UInt32 = size(Δ)[1:3]
607-
@uniform channels::UInt32, batch::UInt32 = size(Δ, 4), size(Δ, 5)
608-
@uniform out_width::UInt32, out_height::UInt32, out_depth::UInt32 = size(dx)[1:3]
609-
i::UInt32, j::UInt32, k::UInt32 = @index(Global, NTuple)
610-
ow0, ow1, w0λ, w1λ = source_idx_and_λ(rwidth, i - one(UInt32), align, out_width)
611-
oh0, oh1, h0λ, h1λ = source_idx_and_λ(rheight, j - one(UInt32), align, out_height)
612-
od0, od1, d0λ, d1λ = source_idx_and_λ(rdepth, k - one(UInt32), align, out_depth)
613-
@inbounds for n in UnitRange{UInt32}(one(UInt32), batch), c in UnitRange{UInt32}(one(UInt32), channels)
605+
@uniform in_width, in_height, in_depth = size(Δ)[1:3]
606+
@uniform channels, batch = size(Δ, 4), size(Δ, 5)
607+
@uniform out_width, out_height, out_depth = size(dx)[1:3]
608+
i, j, k = @index(Global, NTuple)
609+
ow0, ow1, w0λ, w1λ = source_idx_and_λ(rwidth, i - 1, align, out_width)
610+
oh0, oh1, h0λ, h1λ = source_idx_and_λ(rheight, j - 1, align, out_height)
611+
od0, od1, d0λ, d1λ = source_idx_and_λ(rdepth, k - 1, align, out_depth)
612+
@inbounds for n in 1:batch, c in 1:channels
614613
val = Δ[i, j, k, c, n]
615614
@atomic dx[ow0, oh0, od0, c, n] += w0λ * h0λ * d0λ * val
616615
@atomic dx[ow1, oh0, od0, c, n] += w1λ * h0λ * d0λ * val
@@ -625,17 +624,21 @@ end
625624
end
626625

627626
@inline function source_idx_and_λ(
628-
ratio::T, out_idx::UInt32, ::Val{align}, in_width::UInt32,
627+
ratio::T, out_idx::Int, ::Val{align}, in_width::Int,
629628
) where {T, align}
630629
real_index = align ?
631630
ratio * out_idx :
632631
max(zero(T), ratio * (out_idx + T(0.5)) - T(0.5))
633632

634-
iw0 = floor(UInt32, real_index)
635-
offset::UInt32 = ifelse(iw0 < in_width - one(UInt32), one(UInt32), zero(UInt32))
636-
iw1 = iw0 + offset + one(UInt32)
633+
iw0 = if T <: Rational
634+
floor(Int, real_index) # Not GPU-friendly, but allows for Rational support.
635+
else
636+
unsafe_trunc(Int, floor(real_index))
637+
end
638+
offset = ifelse(iw0 < in_width - 1, 1, 0)
639+
iw1 = iw0 + offset + 1
637640

638641
w1lambda = real_index - iw0
639642
w0lambda = one(T) - w1lambda
640-
return iw0 + one(UInt32), iw1, w0lambda, w1lambda
643+
return iw0 + 1, iw1, w0lambda, w1lambda
641644
end

0 commit comments

Comments
 (0)