diff --git a/Project.toml b/Project.toml
index 25894ff..ea4fb68 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "AcceleratedKernels"
 uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 authors = ["Andrei-Leonard Nicusan <leonard@evophase.co.uk> and contributors"]
-version = "0.4.1"
+version = "0.4.2"
 
 [deps]
 ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
diff --git a/src/accumulate/accumulate_nd.jl b/src/accumulate/accumulate_nd.jl
index 8aaa83e..aeb08ca 100644
--- a/src/accumulate/accumulate_nd.jl
+++ b/src/accumulate/accumulate_nd.jl
@@ -227,7 +227,7 @@ end
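-    # We have a block of threads to accumulate along the dims axis; do it in chunks of
-    # block_size and keep track of previous chunks' running prefix
+    # We have a block of threads to accumulate along the dims axis; do it in chunks of
+    # 0x2 * block_size elements and keep track of previous chunks' running prefix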
     ichunk = typeof(iblock)(0)
-    num_chunks = (length_dims + block_size - 0x1) ÷ block_size
+    num_chunks = (length_dims + (0x2 * block_size) - 0x1) ÷ (0x2 * block_size)
     total = neutral
 
     if ithread == 0x0
@@ -326,7 +326,7 @@ end
 
             # ...and accumulate the last value too
             if bi == 0x2 * block_size - 0x1
-                if iblock < num_chunks - 0x1
+                if ichunk < num_chunks - 0x1
                     temp[bi + bank_offset_b + 0x1] = op(t2, v[
                         input_base_idx +
                         ((ichunk + 0x1) * block_size * 0x2 - 0x1) * vstrides[dims] +
diff --git a/test/accumulate.jl b/test/accumulate.jl
index f3be80a..f83e514 100644
--- a/test/accumulate.jl
+++ b/test/accumulate.jl
@@ -192,6 +192,17 @@ end
     # Test that undefined kwargs are not accepted
     @test_throws MethodError AK.accumulate(+, v; init=10, dims=2, inclusive=false, bad=:kwarg)
 
+    # Test accumulation along each dimension of large tall and wide matrices
+    for D in [(1_000_000,3), (3,1_000_000)], dims in [1,2]
+        @testset let D = D, dims = dims
+            vh = ones(Float32, D)
+            v = array_from_host(vh)
+            s = AK.accumulate(+, v; init=0, dims)
+            sh = Array(s)
+            @test sh == accumulate(+, vh; init=0, dims)
+        end
+    end
+
     # Testing different settings
     AK.accumulate(
         (x, y) -> x + 1,