Skip to content

Statistics.mean(Float32, arr, dims=1) throws KernelError #611

@Yuan-Ru-Lin

Description

@Yuan-Ru-Lin
julia> using CUDA, Statistics

julia> a_gpu = CUDA.rand(Int32, 100, 100)
julia> m_gpu = mean(Float32, a_gpu, dims=1)

resulted in the following error.

GPU compilation of MethodInstance for CUDA.partial_mapreduce_grid(::ComposedFunction{Base.Fix1{typeof(*), Float64}, Type{Float32}}, ::typeof(Base.add_sum), ::Float64, ::CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, ::CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, ::Val{true}, ::CuDeviceMatrix{Float64, 1}, ::CuDeviceMatrix{Int32, 1}) failed
KernelError: passing non-bitstype argument

Argument 2 to your kernel function is of type ComposedFunction{Base.Fix1{typeof(*), Float64}, Type{Float32}}, which is not a bitstype:
  .inner is of type Type{Float32} which is not isbits.


Only bitstypes, which are "plain data" types that are immutable
and contain no references to other values, can be used in GPU kernels.
For more information, see the `Base.isbitstype` function.

Stacktrace:
  [1] check_invocation(job::GPUCompiler.CompilerJob)
    @ GPUCompiler /pscratch/sd/y/yuanru/.julia/packages/GPUCompiler/Ecaql/src/validation.jl:108
  [2] macro expansion
    @ /pscratch/sd/y/yuanru/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:87 [inlined]
  [3] macro expansion
    @ /pscratch/sd/y/yuanru/.julia/packages/Tracy/slmNc/src/tracepoint.jl:163 [inlined]
  [4] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler /pscratch/sd/y/yuanru/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:85
  [5] compile_unhooked
    @ /pscratch/sd/y/yuanru/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:80 [inlined]
  [6] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler /pscratch/sd/y/yuanru/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:67
  [7] compile
    @ /pscratch/sd/y/yuanru/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:55 [inlined]
  [8] #1182
    @ /pscratch/sd/y/yuanru/.julia/packages/CUDA/ja0IX/src/compiler/compilation.jl:250 [inlined]
  [9] JuliaContext(f::CUDA.var"#1182#1185"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler /pscratch/sd/y/yuanru/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:34
 [10] JuliaContext(f::Function)
    @ GPUCompiler /pscratch/sd/y/yuanru/.julia/packages/GPUCompiler/Ecaql/src/driver.jl:25
 [11] compile(job::GPUCompiler.CompilerJob)
    @ CUDA /pscratch/sd/y/yuanru/.julia/packages/CUDA/ja0IX/src/compiler/compilation.jl:249
 [12] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler /pscratch/sd/y/yuanru/.julia/packages/GPUCompiler/Ecaql/src/execution.jl:245
 [13] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler /pscratch/sd/y/yuanru/.julia/packages/GPUCompiler/Ecaql/src/execution.jl:159
 [14] macro expansion
    @ /pscratch/sd/y/yuanru/.julia/packages/CUDA/ja0IX/src/compiler/execution.jl:373 [inlined]
 [15] macro expansion
    @ ./lock.jl:267 [inlined]
 [16] cufunction(f::typeof(CUDA.partial_mapreduce_grid), tt::Type{Tuple{ComposedFunction{Base.Fix1{typeof(*), Float64}, Type{Float32}}, typeof(Base.add_sum), Float64, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, Val{true}, CuDeviceMatrix{Float64, 1}, CuDeviceMatrix{Int32, 1}}}; kwargs::@Kwargs{})
    @ CUDA /pscratch/sd/y/yuanru/.julia/packages/CUDA/ja0IX/src/compiler/execution.jl:368
 [17] cufunction(f::typeof(CUDA.partial_mapreduce_grid), tt::Type{Tuple{ComposedFunction{Base.Fix1{typeof(*), Float64}, Type{Float32}}, typeof(Base.add_sum), Float64, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, CartesianIndices{2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, Val{true}, CuDeviceMatrix{Float64, 1}, CuDeviceMatrix{Int32, 1}}})
    @ CUDA /pscratch/sd/y/yuanru/.julia/packages/CUDA/ja0IX/src/compiler/execution.jl:365
 [18] macro expansion
    @ /pscratch/sd/y/yuanru/.julia/packages/CUDA/ja0IX/src/compiler/execution.jl:112 [inlined]
 [19] mapreducedim!(f::ComposedFunction{Base.Fix1{typeof(*), Float64}, Type{Float32}}, op::typeof(Base.add_sum), R::CuArray{Float64, 2, CUDA.DeviceMemory}, A::CuArray{Int32, 2, CUDA.DeviceMemory}; init::Float64)
    @ CUDA /pscratch/sd/y/yuanru/.julia/packages/CUDA/ja0IX/src/mapreduce.jl:229
 [20] mapreducedim!
    @ /pscratch/sd/y/yuanru/.julia/packages/CUDA/ja0IX/src/mapreduce.jl:169 [inlined]
 [21] _mapreduce(f::ComposedFunction{Base.Fix1{typeof(*), Float64}, Type{Float32}}, op::typeof(Base.add_sum), As::CuArray{Int32, 2, CUDA.DeviceMemory}; dims::Int64, init::Nothing)
    @ GPUArrays /pscratch/sd/y/yuanru/.julia/packages/GPUArrays/u6tui/src/host/mapreduce.jl:76
 [22] mapreduce(::Function, ::Function, ::CuArray{Int32, 2, CUDA.DeviceMemory}; dims::Int64, init::Nothing)
    @ GPUArrays /pscratch/sd/y/yuanru/.julia/packages/GPUArrays/u6tui/src/host/mapreduce.jl:28
 [23] mapreduce
    @ /pscratch/sd/y/yuanru/.julia/packages/GPUArrays/u6tui/src/host/mapreduce.jl:28 [inlined]
 [24] _sum
    @ ./reducedim.jl:1041 [inlined]
 [25] sum
    @ ./reducedim.jl:1013 [inlined]
 [26] _mean
    @ /pscratch/sd/y/yuanru/.julia/packages/GPUArrays/u6tui/src/host/statistics.jl:37 [inlined]
 [27] #mean#1
    @ /global/cfs/cdirs/m2676/users/yuanru/.juliaup/juliaup/julia-1.10.10+0.x64.linux.gnu/share/julia/stdlib/v1.10/Statistics/src/Statistics.jl:104 [inlined]
 [28] top-level scope
    @ REPL[5]:1
 [29] top-level scope
    @ none:1

Note that it works if I remove either the `Float32` conversion argument or the `dims=1` keyword:

julia> m = mean(Float32, a_gpu)
2.6236948f6
julia> m = mean(a_gpu, dims=1)
1×100 CuArray{Float64, 2, CUDA.DeviceMemory}:
 6.44418e7  1.63643e8  5.03201e7  4.19325e7  -9.81884e7  -1.30971e8  -1.48728e8  …  1.86605e7  -5.30929e6  9.33825e7  -6.25027e7  8.92984e7  5.62057e7  6.36258e7

The CUDA.jl package version is v5.8.2.

Version Info
Julia Version 1.10.10
Commit 95f30e51f41 (2025-06-27 09:51 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 256 × AMD EPYC 7713 64-Core Processor
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, znver3)
Threads: 1 default, 0 interactive, 1 GC (on 256 virtual cores)
Environment:
  JULIA_PROJECT = @work
  JULIA_DEPOT_PATH = /pscratch/sd/y/yuanru/.julia

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions