diff --git a/Project.toml b/Project.toml
index 0051650a..c7e78589 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,8 +3,10 @@ uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 version = "11.2.3"
 
 [deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+GPUToolbox = "096a3bc2-3ced-46d0-87f4-dd12716f4bfc"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -22,8 +24,10 @@ JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 JLD2Ext = "JLD2"
 
 [compat]
+AcceleratedKernels = "0.4"
 Adapt = "4.0"
 GPUArraysCore = "= 0.2.0"
+GPUToolbox = "0.2, 0.3"
 JLD2 = "0.4, 0.5"
 KernelAbstractions = "0.9.28"
 LLVM = "3.9, 4, 5, 6, 7, 8, 9"
diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 8c1fc14e..ad4cad6d 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -1,5 +1,6 @@
 module GPUArrays
 
+using GPUToolbox
 using KernelAbstractions
 using Serialization
 using Random
@@ -16,6 +17,7 @@ using Reexport
 @reexport using GPUArraysCore
 
 using KernelAbstractions
+import AcceleratedKernels as AK
 
 # device functionality
 include("device/abstractarray.jl")
@@ -26,6 +28,7 @@ include("host/construction.jl")
 ## integrations and specialized methods
 include("host/base.jl")
 include("host/indexing.jl")
+include("host/reverse.jl")
 include("host/broadcast.jl")
 include("host/mapreduce.jl")
 include("host/linalg.jl")
diff --git a/src/host/reverse.jl b/src/host/reverse.jl
new file mode 100644
index 00000000..f0b1caf7
--- /dev/null
+++ b/src/host/reverse.jl
@@ -0,0 +1,148 @@
+# reversing
+
+# the kernel works by treating the array as 1d. after reversing by dimension x an element at
+# pos [i1, i2, i3, ... , i{x}, ..., i{n}] will be at
+# pos [i1, i2, i3, ... , d{x} - i{x} + 1, ..., i{n}] where d{x} is the size of dimension x
+
+# out-of-place version, copying a single value per thread from input to output
+function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
+                  dims=1:ndims(input)) where {T, N}
+    @assert size(input) == size(output)
+    rev_dims = ntuple((d)-> d in dims && size(input, d) > 1, N)
+    ref = size(input) .+ 1
+    # converts an ND-index in the data array to the linear index
+    lin_idx = LinearIndices(input)
+    # converts a linear index in a reduced array to an ND-index, but using the reduced size
+    nd_idx = CartesianIndices(input)
+
+    ## COV_EXCL_START
+    @kernel unsafe_indices=true function kernel(input, output)
+        offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
+        index_in = offset_in + @index(Local, Linear)
+
+        @inbounds if index_in <= length(input)
+            idx = Tuple(nd_idx[index_in])
+            idx = ifelse.(rev_dims, ref .- idx, idx)
+            index_out = lin_idx[idx...]
+            output[index_out] = input[index_in]
+        end
+    end
+    ## COV_EXCL_STOP
+
+    nthreads = 256
+
+    kernel(get_backend(input))(input, output; workgroupsize=nthreads, ndrange=length(input))
+end
+
+# in-place version, swapping elements on half the number of threads
+function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
+    rev_dims = ntuple((d)-> d in dims && size(data, d) > 1, N)
+    half_dim = findlast(rev_dims)
+    if isnothing(half_dim)
+        # no reverse operation needed at all in this case.
+        return
+    end
+    ref = size(data) .+ 1
+    # converts an ND-index in the data array to the linear index
+    lin_idx = LinearIndices(data)
+    reduced_size = ntuple((d)->ifelse(d==half_dim, cld(size(data,d),2), size(data,d)), N)
+    reduced_length = prod(reduced_size)
+    # converts a linear index in a reduced array to an ND-index, but using the reduced size
+    nd_idx = CartesianIndices(reduced_size)
+
+    ## COV_EXCL_START
+    @kernel unsafe_indices=true function kernel(data)
+        offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
+        index_in = offset_in + @index(Local, Linear)
+
+        @inbounds if index_in <= reduced_length
+            idx = Tuple(nd_idx[index_in])
+            index_in = lin_idx[idx...]
+            idx = ifelse.(rev_dims, ref .- idx, idx)
+            index_out = lin_idx[idx...]
+
+            if index_in < index_out
+                temp = data[index_out]
+                data[index_out] = data[index_in]
+                data[index_in] = temp
+            end
+        end
+    end
+    ## COV_EXCL_STOP
+
+    # NOTE: we launch slightly more than half the number of elements in the array as threads.
+    # The last non-singleton dimension along which to reverse is used to define how the array is split.
+    # Only the middle row in case of an odd array dimension could cause trouble, but this is prevented by
+    # ignoring the threads that cross the mid-point
+
+    nthreads = 256
+
+    kernel(get_backend(data))(data; workgroupsize=nthreads, ndrange=length(data))
+end
+
+
+# n-dimensional API
+
+function Base.reverse!(data::AnyGPUArray{T, N}; dims=:) where {T, N}
+    if isa(dims, Colon)
+        dims = 1:ndims(data)
+    end
+    if !applicable(iterate, dims)
+        throw(ArgumentError("dimension $dims is not an iterable"))
+    end
+    if !all(1 .≤ dims .≤ ndims(data))
+        throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(data))"))
+    end
+
+    _reverse!(data; dims=dims)
+
+    return data
+end
+
+# out-of-place
+function Base.reverse(input::AnyGPUArray{T, N}; dims=:) where {T, N}
+    if isa(dims, Colon)
+        dims = 1:ndims(input)
+    end
+    if !applicable(iterate, dims)
+        throw(ArgumentError("dimension $dims is not an iterable"))
+    end
+    if !all(1 .≤ dims .≤ ndims(input))
+        throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(input))"))
+    end
+
+    if all(size(input)[[dims...]].==1)
+        # no reverse operation needed at all in this case.
+        return copy(input)
+    else
+        output = similar(input)
+        _reverse(input, output; dims=dims)
+        return output
+    end
+end
+
+
+# 1-dimensional API
+
+# in-place
+Base.@propagate_inbounds function Base.reverse!(data::AnyGPUVector{T}, start::Integer,
+                                                stop::Integer=length(data)) where {T}
+    _reverse!(view(data, start:stop))
+    return data
+end
+
+Base.reverse!(data::AnyGPUVector{T}) where {T} = @inbounds reverse!(data, 1, length(data))
+
+# out-of-place
+Base.@propagate_inbounds function Base.reverse(input::AnyGPUVector{T}, start::Integer,
+                                               stop::Integer=length(input)) where {T}
+    output = similar(input)
+
+    start > 1 && copyto!(output, 1, input, 1, start-1)
+    _reverse(view(input, start:stop), view(output, start:stop))
+    stop < length(input) && copyto!(output, stop+1, input, stop+1)
+
+    return output
+end
+
+Base.reverse(data::AnyGPUVector{T}) where {T} = @inbounds reverse(data, 1, length(data))
diff --git a/test/testsuite/base.jl b/test/testsuite/base.jl
index 6bcfd5b4..55436126 100644
--- a/test/testsuite/base.jl
+++ b/test/testsuite/base.jl
@@ -381,6 +381,51 @@ end
         gA = reshape(AT(A),4)
     end
 
+    @testset "reverse" begin
+        # 1-d out-of-place
+        @test compare(x->reverse(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x, 10), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x, 10, 90), AT, rand(Float32, 1000))
+
+        # 1-d in-place
+        @test compare(x->reverse!(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x, 10), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x, 10, 90), AT, rand(Float32, 1000))
+
+        # n-d out-of-place
+        for shape in ([1, 2, 4, 3], [4, 2], [5], [2^5, 2^5, 2^5]),
+            dim in 1:length(shape)
+            @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+            cpu = rand(Float32, shape...)
+            gpu = AT(cpu)
+            reverse!(gpu; dims=dim)
+            @test Array(gpu) == reverse(cpu; dims=dim)
+        end
+
+        # supports multidimensional reverse
+        for shape in ([1, 2, 4, 3], [2^5, 2^5, 2^5]),
+            dim in ((1,2),(2,3),(1,3),:)
+            @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+            cpu = rand(Float32, shape...)
+            gpu = AT(cpu)
+            reverse!(gpu; dims=dim)
+            @test Array(gpu) == reverse(cpu; dims=dim)
+        end
+
+        # wrapped array
+        @test compare(x->reverse(x), AT, reshape(rand(Float32, 2,2), 4))
+
+        # error throwing
+        cpu = rand(Float32, 1,2,3,4)
+        gpu = AT(cpu)
+        @test_throws ArgumentError reverse!(gpu, dims=5)
+        @test_throws ArgumentError reverse!(gpu, dims=0)
+        @test_throws ArgumentError reverse(gpu, dims=5)
+        @test_throws ArgumentError reverse(gpu, dims=0)
+    end
+
     @testset "reinterpret" begin
         A = Int32[-1,-2,-3]
         dA = AT(A)
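
The header comments of src/host/reverse.jl describe the out-of-place kernel purely as an index remapping: an element at Cartesian position (i1, ..., ix, ..., in) moves to (i1, ..., dx - ix + 1, ..., in) for every reversed dimension x. As a rough illustration, the same remapping can be exercised serially on the CPU; the sketch below is not part of the diff, and the name cpu_reverse_dims is made up for this example.

# serial CPU sketch of the out-of-place index remapping used by _reverse
function cpu_reverse_dims(input::AbstractArray{T,N}; dims=1:ndims(input)) where {T,N}
    output = similar(input)
    rev_dims = ntuple(d -> d in dims && size(input, d) > 1, N)  # which dims actually flip
    ref = size(input) .+ 1
    lin_idx = LinearIndices(input)
    nd_idx = CartesianIndices(input)
    for index_in in eachindex(input)              # one "thread" per element
        idx = Tuple(nd_idx[index_in])             # linear -> Cartesian index
        idx = ifelse.(rev_dims, ref .- idx, idx)  # mirror the reversed dimensions
        output[lin_idx[idx...]] = input[index_in] # scatter to the mirrored position
    end
    return output
end

A = reshape(collect(1:24), 2, 3, 4)
@assert cpu_reverse_dims(A; dims=(1, 3)) == reverse(reverse(A; dims=1); dims=3)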
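
The in-place _reverse! kernel instead launches roughly half as many work-items as elements: the last reversed non-singleton dimension is halved, each work-item swaps one element with its mirrored partner, and the index_in < index_out guard skips work-items that would hit or cross the midpoint of an odd-sized dimension. A minimal CPU sketch of that strategy, again with an illustrative name and the same index arithmetic as above:

# serial CPU sketch of the in-place "half the threads" swap strategy used by _reverse!
function cpu_reverse_dims!(data::AbstractArray{T,N}; dims=1:ndims(data)) where {T,N}
    rev_dims = ntuple(d -> d in dims && size(data, d) > 1, N)
    half_dim = findlast(rev_dims)
    isnothing(half_dim) && return data            # nothing to reverse
    ref = size(data) .+ 1
    lin_idx = LinearIndices(data)
    # halve the last reversed dimension; only this reduced index space gets "threads"
    reduced_size = ntuple(d -> d == half_dim ? cld(size(data, d), 2) : size(data, d), N)
    for I in CartesianIndices(reduced_size)
        idx = Tuple(I)
        index_in = lin_idx[idx...]
        idx = ifelse.(rev_dims, ref .- idx, idx)
        index_out = lin_idx[idx...]
        if index_in < index_out                   # skip the midpoint and double swaps
            data[index_in], data[index_out] = data[index_out], data[index_in]
        end
    end
    return data
end

B = collect(reshape(1.0:30.0, 5, 6))
@assert cpu_reverse_dims!(copy(B); dims=1) == reverse(B; dims=1)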