diff --git a/lib/JLArrays/Project.toml b/lib/JLArrays/Project.toml index 700a31aa..ec263f7c 100644 --- a/lib/JLArrays/Project.toml +++ b/lib/JLArrays/Project.toml @@ -1,7 +1,7 @@ name = "JLArrays" uuid = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" authors = ["Tim Besard "] -version = "0.3.0" +version = "0.3.1" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index 4b238fa0..8f2fe024 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -377,7 +377,7 @@ KernelAbstractions.allocate(::JLBackend, ::Type{T}, dims::Tuple) where T = JLArr end if KernelAbstractions.workgroupsize(kernel) <: DynamicSize && workgroupsize === nothing - workgroupsize = (1024,) # Vectorization, 4x unrolling, minimal grain size + workgroupsize = (MAXTHREADS,) # Vectorization, 4x unrolling, minimal grain size end iterspace, dynamic = partition(kernel, ndrange, workgroupsize) # partition checked that the ndrange's agreed @@ -403,6 +403,7 @@ else end function (obj::Kernel{JLBackend})(args...; ndrange=nothing, workgroupsize=nothing) + ndrange, workgroupsize, _, _ = launch_config(obj, ndrange, workgroupsize) device_args = jlconvert.(args) new_obj = convert_to_cpu(obj) new_obj(device_args...; ndrange, workgroupsize) diff --git a/src/host/random.jl b/src/host/random.jl index 256ecea9..7759e841 100644 --- a/src/host/random.jl +++ b/src/host/random.jl @@ -96,9 +96,10 @@ function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Number threads = (length(A) - 1) ÷ 2 + 1 @kernel function randn!(a, randstates) i = @index(Global, Linear) + threadidx = @index(Local, Linear) idx = 2*(i - 1) + 1 - U1 = gpu_rand(T, i, randstates) - U2 = gpu_rand(T, i, randstates) + U1 = gpu_rand(T, threadidx, randstates) + U2 = gpu_rand(T, threadidx, randstates) Z0 = sqrt(T(-2.0)*log(U1))*cos(T(2pi)*U2) Z1 = sqrt(T(-2.0)*log(U1))*sin(T(2pi)*U2) @inbounds a[idx] = Z0 diff --git a/test/testsuite/random.jl 
b/test/testsuite/random.jl index f2cf832a..6ecd235f 100644 --- a/test/testsuite/random.jl +++ b/test/testsuite/random.jl @@ -6,20 +6,25 @@ end cpu_rng = Random.default_rng() + SEEDING_BROKEN = (rng != cpu_rng) && !contains(string(AT), "JLArray") + @testset "rand" begin # uniform - for T in eltypes, d in (10, (10,10)) + @testset "$d $T" for T in eltypes, d in (10, (10, 10), (1024, 1024)) A = AT{T}(undef, d) B = copy(A) rand!(rng, A) rand!(rng, B) @test Array(A) != Array(B) + A = AT(rand(T, d)) + B = AT(rand(T, d)) + Random.seed!(rng) Random.seed!(rng, 1) rand!(rng, A) Random.seed!(rng, 1) rand!(rng, B) - @test all(Array(A) .== Array(B)) + @test Array(A) == Array(B) broken=SEEDING_BROKEN && (prod(d) > length(rng.state)) if rng != cpu_rng rand!(cpu_rng, A) @@ -44,19 +49,22 @@ @testset "randn" begin # normally-distributed # XXX: randn calls sqrt, and Base's sqrt(::Complex) performs # checked type conversions that throw boxed numbers. - for T in filter(isrealfloattype, eltypes), d in (2, (2,2)) + @testset "$d $T" for T in filter(isrealfloattype, eltypes), d in (2, (2, 2), (1024, 1024)) A = AT{T}(undef, d) B = copy(A) randn!(rng, A) randn!(rng, B) @test Array(A) != Array(B) + A = AT(rand(T, d)) + B = AT(rand(T, d)) + Random.seed!(rng) Random.seed!(rng, 1) randn!(rng, A) Random.seed!(rng, 1) randn!(rng, B) - @test Array(A) == Array(B) + @test Array(A) == Array(B) broken=SEEDING_BROKEN && (prod(d) > (2 * length(rng.state))) if rng != cpu_rng randn!(cpu_rng, A)