# Patch summary (descriptive preamble only; the patch text below is unchanged).
#
# Files touched:
#   * Project.toml
#       - `version` goes from "0.4.1" to "0.4.2".
#   * src/accumulate/accumulate_nd.jl
#       - `num_chunks` is now `length_dims` rounded up in units of
#         `0x2 * block_size` instead of units of `block_size`. The context line
#         in the second hunk indexes with `(ichunk + 0x1) * block_size * 0x2`,
#         which suggests each chunk covers 2 * block_size elements along `dims`
#         -- TODO confirm against the full kernel.
#       - The guard in the "accumulate the last value too" branch now compares
#         `ichunk` (rather than `iblock`) against `num_chunks - 0x1`.
#   * test/accumulate.jl
#       - Adds a loop over shapes (1_000_000, 3) and (3, 1_000_000) and over
#         dims 1 and 2. Each case builds `ones(Float32, D)`, runs
#         `AK.accumulate(+, v; init=0, dims)`, and checks the result copied back
#         with `Array` for equality against the unqualified
#         `accumulate(+, vh; init=0, dims)` (presumably Base's -- verify).
#
# NOTE(review): the patch below sits on a single physical line, so the original
# line breaks and indentation inside the hunks are not visible here. The hunk
# headers imply blank or continuation lines whose positions cannot be recovered
# from this text; restore the patch from the repository before applying it.
diff --git a/Project.toml b/Project.toml index 25894ff..ea4fb68 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AcceleratedKernels" uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" authors = ["Andrei-Leonard Nicusan and contributors"] -version = "0.4.1" +version = "0.4.2" [deps] ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" diff --git a/src/accumulate/accumulate_nd.jl b/src/accumulate/accumulate_nd.jl index 8aaa83e..aeb08ca 100644 --- a/src/accumulate/accumulate_nd.jl +++ b/src/accumulate/accumulate_nd.jl @@ -227,7 +227,7 @@ end # We have a block of threads to accumulate along the dims axis; do it in chunks of # block_size and keep track of previous chunks' running prefix ichunk = typeof(iblock)(0) - num_chunks = (length_dims + block_size - 0x1) ÷ block_size + num_chunks = (length_dims + (0x2 * block_size) - 0x1) ÷ (0x2 * block_size) total = neutral if ithread == 0x0 @@ -326,7 +326,7 @@ end # ...and accumulate the last value too if bi == 0x2 * block_size - 0x1 - if iblock < num_chunks - 0x1 + if ichunk < num_chunks - 0x1 temp[bi + bank_offset_b + 0x1] = op(t2, v[ input_base_idx + ((ichunk + 0x1) * block_size * 0x2 - 0x1) * vstrides[dims] + diff --git a/test/accumulate.jl b/test/accumulate.jl index f3be80a..f83e514 100644 --- a/test/accumulate.jl +++ b/test/accumulate.jl @@ -192,6 +192,17 @@ end # Test that undefined kwargs are not accepted @test_throws MethodError AK.accumulate(+, v; init=10, dims=2, inclusive=false, bad=:kwarg) + # Test all options with bigger matrices + for D in [(1_000_000,3), (3,1_000_000)], dims in [1,2] + @testset let D = D, dims = dims + vh = ones(Float32, D) + v = array_from_host(vh) + s = AK.accumulate(+, v; init=0, dims) + sh = Array(s) + @test sh == accumulate(+, vh; init=0, dims) + end + end + # Testing different settings AK.accumulate( (x, y) -> x + 1,