Accumulate crashes on CUDA with DecoupledLookback #74

@AntonReinhard

MWE:

using CUDA
using AcceleratedKernels

v = CUDA.ones(Int32, 2^15)
AcceleratedKernels.accumulate(+, v; init=Int32(0), alg=AcceleratedKernels.DecoupledLookback())

gives me the crash below (instruction selection fails on an AtomicFence emitted via UnsafeAtomics; a comparison run and an isolation sketch follow after the logs):

julia> AcceleratedKernels.accumulate(+, v; init=Int32(0), alg=AcceleratedKernels.DecoupledLookback())
ERROR: LLVM error: Cannot select: 0x2c2518f0: ch = AtomicFence 0x2aac16f0, TargetConstant:i64<4>, TargetConstant:i64<1>, /home/reinha57/.julia/packages/UnsafeAtomics/vpyYB/src/core.jl:248 @[ /home/reinha57/.julia/packages/UnsafeAtomics/vpyYB/src/core.jl:12 @[ /home/reinha57/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate_1d_gpu.jl:174 @[ none:0 ] ] ]
  0x2ded3310: i64 = TargetConstant<4>
  0x2c2511f0: i64 = TargetConstant<1>
In function: _Z25gpu__accumulate_previous_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_10StaticSizeI6_256__ES8_vEE1_13CuDeviceArrayI5Int32Li1ELi1EESF_I5UInt8Li1ELi1EESH_
Stacktrace:
  [1] handle_error(reason::Cstring)
    @ LLVM ~/.julia/packages/LLVM/iza6e/src/core/context.jl:194
  [2] LLVMTargetMachineEmitToMemoryBuffer
    @ ~/.julia/packages/LLVM/iza6e/lib/18/libLLVM.jl:11531 [inlined]
  [3] emit(tm::LLVM.TargetMachine, mod::LLVM.Module, filetype::LLVM.API.LLVMCodeGenFileType)
    @ LLVM ~/.julia/packages/LLVM/iza6e/src/targetmachine.jl:118
  [4] mcgen
    @ ~/.julia/packages/GPUCompiler/j4HFa/src/mcgen.jl:75 [inlined]
  [5] mcgen(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
    @ CUDA ~/.julia/packages/CUDA/x8d2s/src/compiler/compilation.jl:127
  [6] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:438
  [7] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:115
  [8] compile_unhooked
    @ ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:80 [inlined]
  [9] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:67
 [10] compile
    @ ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:55 [inlined]
 [11] #compile##0
    @ ~/.julia/packages/CUDA/x8d2s/src/compiler/compilation.jl:250 [inlined]
 [12] JuliaContext(f::CUDA.var"#compile##0#compile##1"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:34
 [13] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/driver.jl:25
 [14] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/x8d2s/src/compiler/compilation.jl:249
 [15] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/execution.jl:245
 [16] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/j4HFa/src/execution.jl:159
 [17] macro expansion
    @ ~/.julia/packages/CUDA/x8d2s/src/compiler/execution.jl:373 [inlined]
 [18] macro expansion
    @ ./lock.jl:376 [inlined]
 [19] cufunction(f::typeof(AcceleratedKernels.gpu__accumulate_previous!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{…}, typeof(+), CuDeviceVector{…}, CuDeviceVector{…}, CuDeviceVector{…}}}; kwargs::@Kwargs{always_inline::Bool, maxthreads::Int64})
    @ CUDA ~/.julia/packages/CUDA/x8d2s/src/compiler/execution.jl:368
 [20] macro expansion
    @ ~/.julia/packages/CUDA/x8d2s/src/compiler/execution.jl:112 [inlined]
 [21] (::KernelAbstractions.Kernel{CUDABackend, KernelAbstractions.NDIteration.StaticSize{(256,)}, KernelAbstractions.NDIteration.DynamicSize, typeof(AcceleratedKernels.gpu__accumulate_previous!)})(::Function, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
    @ CUDA.CUDAKernels ~/.julia/packages/CUDA/x8d2s/src/CUDAKernels.jl:127
 [22] accumulate_1d_gpu!(op::typeof(+), v::CuArray{Int32, 1, CUDA.DeviceMemory}, backend::CUDABackend, ::AcceleratedKernels.DecoupledLookback; init::Int32, neutral::Int32, inclusive::Bool, max_tasks::Int64, min_elems::Int64, block_size::Int64, temp::Nothing, temp_flags::Nothing)
    @ AcceleratedKernels ~/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate_1d_gpu.jl:307
 [23] accumulate_1d_gpu!
    @ ~/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate_1d_gpu.jl:257 [inlined]
 [24] _accumulate_impl!(op::typeof(+), v::CuArray{Int32, 1, CUDA.DeviceMemory}, backend::CUDABackend; init::Int32, neutral::Int32, dims::Nothing, inclusive::Bool, alg::AcceleratedKernels.DecoupledLookback, max_tasks::Int64, min_elems::Int64, prefer_threads::Bool, block_size::Int64, temp::Nothing, temp_flags::Nothing)
    @ AcceleratedKernels ~/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate.jl:171
 [25] #accumulate!#99
    @ ~/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate.jl:128 [inlined]
 [26] accumulate(op::Function, v::CuArray{Int32, 1, CUDA.DeviceMemory}, backend::CUDABackend; init::Int32, kwargs::@Kwargs{alg::AcceleratedKernels.DecoupledLookback})
    @ AcceleratedKernels ~/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate.jl:227
 [27] accumulate
    @ ~/.julia/packages/AcceleratedKernels/AdYRJ/src/accumulate/accumulate.jl:219 [inlined]
 [28] top-level scope
    @ REPL[4]:1
Some type information was truncated. Use `show(err)` to see complete types.
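
For comparison, the same call with the other accumulate algorithm is a way to scope the failure to the DecoupledLookback path. A sketch, assuming ScanPrefixes() is the alternative accumulate algorithm AcceleratedKernels documents; I have not verified this variant on the setup above:

using CUDA
using AcceleratedKernels

v = CUDA.ones(Int32, 2^15)
# Assumption: ScanPrefixes() is the other accumulate algorithm; if this
# compiles and runs, the AtomicFence failure is specific to DecoupledLookback.
AcceleratedKernels.accumulate(+, v; init=Int32(0), alg=AcceleratedKernels.ScanPrefixes())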
Environment:

julia> versioninfo()
Julia Version 1.12.3
Commit 966d0af0fdf (2025-12-15 11:20 UTC)
Build Info:
  Official https://julialang.org release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 64 × AMD EPYC 7452 32-Core Processor
  WORD_SIZE: 64
  LLVM: libLLVM-18.1.7 (ORCJIT, znver2)
  GC: Built with stock GC
Threads: 1 default, 1 interactive, 1 GC (on 64 virtual cores)

julia> CUDA.versioninfo()
CUDA toolchain: 
- runtime 13.0, artifact installation
- driver 580.95.5 for 13.0
- compiler 13.0

CUDA libraries: 
- CUBLAS: 13.1.0
- CURAND: 10.4.0
- CUFFT: 12.0.0
- CUSOLVER: 12.0.4
- CUSPARSE: 12.6.3
- CUPTI: 2025.3.1 (API 13.0.1)
- NVML: 13.0.0+580.95.5

Julia packages: 
- CUDA: 5.9.4
- CUDA_Driver_jll: 13.0.2+0
- CUDA_Compiler_jll: 0.3.0+0
- CUDA_Runtime_jll: 0.19.2+0

Toolchain:
- Julia: 1.12.3
- LLVM: 18.1.7

1 device:
  0: NVIDIA A30 (sm_80, 10.925 GiB / 24.000 GiB available)

Relevant package versions:

(@v1.12) pkg> status
Status `~/.julia/environments/v1.12/Project.toml`
  [21141c5a] AMDGPU v2.1.4
  [6a4ca0a5] AcceleratedKernels v0.4.3
  [052768ef] CUDA v5.9.5
  [63c18a36] KernelAbstractions v0.9.39
  [295af30f] Revise v3.12.3
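
If it helps triage: the unselectable node is the AtomicFence emitted from UnsafeAtomics (core.jl:248), reached from accumulate_1d_gpu.jl:174 in the gpu__accumulate_previous! kernel. A standalone kernel along these lines should reproduce it outside AcceleratedKernels. This is a sketch: I am assuming the fence there is UnsafeAtomics.fence with an acquire ordering, matching the TargetConstant<4> in the error, and I have not run it in isolation.

using CUDA
using KernelAbstractions
using UnsafeAtomics

@kernel function fence_kernel!(a)
    i = @index(Global)
    # Hypothetical stand-in for the fence AcceleratedKernels emits at
    # accumulate_1d_gpu.jl:174; the acquire ordering is my assumption.
    UnsafeAtomics.fence(UnsafeAtomics.acquire)
    a[i] += Int32(1)
end

backend = CUDABackend()
a = CUDA.zeros(Int32, 256)
fence_kernel!(backend, 256)(a; ndrange=length(a))
KernelAbstractions.synchronize(backend)

If that hits the same LLVM selection error, the fix probably belongs in UnsafeAtomics or the PTX backend rather than in AcceleratedKernels itself.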
