3 changes: 2 additions & 1 deletion src/GPUArrays.jl
@@ -18,9 +18,10 @@ include("convolution.jl")
include("testsuite/testsuite.jl")
include("jlbackend.jl")
include("random.jl")
include("pool.jl")

export GPUArray, gpu_call, thread_blocks_heuristic, global_size, synchronize_threads
export linear_index, @linearidx, @cartesianidx, convolution!, device, synchronize
export linear_index, @linearidx, @cartesianidx, convolution!, device, synchronize, maxpool2d
export JLArray

end # module
4 changes: 2 additions & 2 deletions src/indexing.jl
@@ -16,7 +16,7 @@ function _getindex(xs::GPUArray{T}, i::Integer) where T
end

function Base.getindex(xs::GPUArray{T}, i::Integer) where T
    assertslow("getindex")
    # assertslow("getindex")
    _getindex(xs, i)
end

@@ -27,7 +27,7 @@ function _setindex!(xs::GPUArray{T}, v::T, i::Integer) where T
end

function Base.setindex!(xs::GPUArray{T}, v::T, i::Integer) where T
    assertslow("setindex!")
    # assertslow("setindex!")
    _setindex!(xs, v, i)
end

44 changes: 44 additions & 0 deletions src/pool.jl
@@ -0,0 +1,44 @@
import CUDAnative
Member: That's not needed, right?

Contributor Author: 👍

function maxpool2d_kernel(state, A::AbstractArray{T}, out, Asize, pool, stride, outSize) where T
    ilin = linear_index(state)
    idx = GPUArrays.gpu_ind2sub(Asize, ilin)
    # the launch covers every index of the padded input; threads outside the output grid return early
    if idx[1] > outSize[1] || idx[2] > outSize[2] || idx[3] > outSize[3] || idx[4] > outSize[4]
        return
    end

    # column-major linear index of the pooling window's top-left element in A
    curr_pos = (idx[1] - 1) * stride + Asize[1] * (idx[2] - 1) * stride +
               (Asize[1] * Asize[2]) * (idx[3] - 1) +
               (Asize[1] * Asize[2] * Asize[3]) * (idx[4] - 1) + 1
    temp_max = A[curr_pos]
    max_pos = curr_pos          # position of the maximum within A (not used further in this kernel)

    for j in 1:pool             # columns of the pooling window
        for i in 1:pool         # rows of the pooling window
            m = A[curr_pos]
            if m > temp_max
                temp_max = m
                max_pos = curr_pos
            end
            curr_pos += 1       # next row within the current column
        end
        curr_pos += Asize[1] - pool   # jump to the top of the next column
    end
    out[(idx[1] - 1) + outSize[1] * (idx[2] - 1) +
        (outSize[1] * outSize[2]) * (idx[3] - 1) +
        (outSize[1] * outSize[2] * outSize[3]) * (idx[4] - 1) + 1] = temp_max
    return
end
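For reference, the hand-written offset above is just the column-major linear index of the window origin at ((idx[1]-1)*stride + 1, (idx[2]-1)*stride + 1, idx[3], idx[4]). A small host-side check (plain Julia 0.6, illustrative values only, not part of this diff) makes the equivalence explicit:

```julia
# Asize, stride and idx are hypothetical values chosen for illustration.
Asize  = (9, 9, 3, 1)            # padded input size (H, W, C, N)
stride = 3
idx    = (2, 3, 1, 1)            # one output coordinate

origin = ((idx[1] - 1) * stride + 1,   # input row of the window's top-left corner
          (idx[2] - 1) * stride + 1,   # input column
          idx[3], idx[4])

@assert sub2ind(Asize, origin...) ==
        (idx[1] - 1) * stride + Asize[1] * (idx[2] - 1) * stride +
        (Asize[1] * Asize[2]) * (idx[3] - 1) +
        (Asize[1] * Asize[2] * Asize[3]) * (idx[4] - 1) + 1
```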


# 2-D max pooling over the first two dimensions of an H × W × C × N array.
function maxpool2d(a, pool::T; stride = pool, pad = 0) where T <: Integer
    # zero-pad both spatial dimensions by `pad` on each side
    b = zeros(typeof(a), size(a, 1) + pad * 2, size(a, 2) + pad * 2, size(a, 3), size(a, 4))
    b[pad + 1 : pad + size(a, 1), pad + 1 : pad + size(a, 2), :, :] = a
    Asize  = UInt32.(size(b))
    pool   = UInt32(pool)
    stride = UInt32(stride)
    # allocate at the padded size, then slice down to the pooled output size
    out = similar(b)
    out = out[1:(div(Asize[1] - pool, stride) + 1), 1:(div(Asize[2] - pool, stride) + 1), :, :]
Member: you could just do similar(b, outsize) no?

Contributor Author: Thanks, I was unaware of this. It should be similar(b, outSize...) perhaps. Also, outSize needs to be determined before similar is called.
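A minimal sketch of what that reordering could look like (hypothetical, not part of this diff; it assumes `similar(b, dims...)` is supported by the backing array type):

```julia
# compute the pooled size first, then allocate the output directly
outSize = (div(Asize[1] - pool, stride) + 1,
           div(Asize[2] - pool, stride) + 1,
           Asize[3], Asize[4])
out = similar(b, Int.(outSize)...)
outSize = UInt32.(outSize)       # kernel arguments stay UInt32, as above
```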

    outSize = UInt32.(size(out))
    gpu_call(maxpool2d_kernel, b, (b, out, Asize, pool, stride, outSize))
    GPUArrays.synchronize(out)
    out
end
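As a usage sketch (not part of the diff), the exported entry point can be exercised on the bundled JLArray reference backend; this assumes `zeros(typeof(a), dims...)` and ranged `setindex!` work for that array type, as the padding code above requires:

```julia
using GPUArrays

a = JLArray(rand(Float32, 8, 8, 3, 1))  # H × W × C × N input
out = maxpool2d(a, 2)                   # 2×2 windows; stride defaults to the pool size
size(out)                               # (4, 4, 3, 1)
```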


54 changes: 54 additions & 0 deletions src/testsuite/pool.jl
@@ -0,0 +1,54 @@
using GPUArrays.TestSuite, Base.Test, Flux

function run_pool(Typ)
    for ET in supported_eltypes()
        T = Typ{ET}
        # complex element types have no total order, so max pooling is skipped for them
        if ET == Complex{Float32} || ET == Complex{Float64}
            continue
        end
        @testset "$ET" begin
            @testset "maxpool with padding" begin
                pool = 3
                stride = 3
                pad = 3

                a = rand(ET, 9, 9, 3, 1)
                b = zeros(eltype(a), size(a, 1) + pad * 2, size(a, 2) + pad * 2, size(a, 3), size(a, 4))
                b[pad + 1 : pad + size(a, 1), pad + 1 : pad + size(a, 2), :, :] = a
                out1 = maxpool(b, (3, 3))

                a = T(a)
                out2 = GPUArrays.maxpool2d(a, pool, pad = pad)

                @test out1 ≈ out2
            end

            @testset "maxpool without padding" begin
                pool = 3
                stride = 3

                a = rand(ET, 9, 9, 3, 1)
                out1 = maxpool(a, (3, 3))

                a = T(a)
                out2 = GPUArrays.maxpool2d(a, pool)

                @test out1 ≈ out2
            end

            @testset "maxpool with full kernel" begin
                pool = 9
                stride = 1

                a = rand(ET, 9, 9, 3, 1)
                out1 = maxpool(a, (9, 9))

                a = T(a)
                out2 = GPUArrays.maxpool2d(a, pool, stride = stride)

                @test out1 ≈ out2
            end
        end
    end
end
2 changes: 2 additions & 0 deletions src/testsuite/testsuite.jl
@@ -42,6 +42,7 @@ include("base.jl")
include("indexing.jl")
# include("vector.jl")
include("random.jl")
include("pool.jl")

function supported_eltypes()
    (Float32, Float64, Int32, Int64, Complex64, Complex128)
@@ -62,6 +63,7 @@ function run_tests(Typ)
    run_mapreduce(Typ)
    run_indexing(Typ)
    run_random(Typ)
    run_pool(Typ)
end

export against_base, run_tests, supported_eltypes
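With `run_pool` registered in `run_tests`, a backend that runs the shared test suite picks up the pooling tests automatically. A sketch against the JLArray reference backend (assuming the `GPUArrays.TestSuite` module layout used here):

```julia
using GPUArrays, GPUArrays.TestSuite

run_tests(JLArray)            # full suite, now including the pooling tests
TestSuite.run_pool(JLArray)   # or only the pooling tests (run_pool is not exported)
```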
2 changes: 2 additions & 0 deletions test/REQUIRE
@@ -0,0 +1,2 @@
Flux
CUDAnative