diff --git a/Project.toml b/Project.toml index ea21d52..b51bb27 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AcceleratedKernels" uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" authors = ["Andrei-Leonard Nicusan and contributors"] -version = "0.3.0" +version = "0.3.1" [deps] ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" diff --git a/src/accumulate/accumulate.jl b/src/accumulate/accumulate.jl index 15f47d3..e88fd18 100644 --- a/src/accumulate/accumulate.jl +++ b/src/accumulate/accumulate.jl @@ -71,11 +71,11 @@ For compatibility with the `Base.accumulate!` function, we provide the two-array we do not need the constraint of `dst` and `src` being different; to minimise memory use, we recommend using the single-array interface (the first one above). -## CPU +## CPU The CPU implementation is currently single-threaded; we are waiting on a multithreaded implementation in OhMyThreads.jl ([issue](https://github.com/JuliaFolds2/OhMyThreads.jl/issues/129)). -## GPU +## GPU For the 1D case (`dims=nothing`), the `alg` can be one of the following: - `DecoupledLookback()`: the default algorithm, using opportunistic lookback to reuse earlier blocks' results; requires device-level memory consistency guarantees, which Apple Metal does not @@ -241,7 +241,7 @@ function accumulate( temp::Union{Nothing, AbstractArray}=nothing, temp_flags::Union{Nothing, AbstractArray}=nothing, ) - dst_type = promote_type(eltype(v), typeof(init)) + dst_type = Base.promote_op(op, eltype(v), typeof(init)) vcopy = similar(v, dst_type) copyto!(vcopy, v) accumulate!( @@ -252,7 +252,7 @@ function accumulate( inclusive=inclusive, alg=alg, - + block_size=block_size, temp=temp, temp_flags=temp_flags,