indices in _forindices_global! should be wrapped in Val #30

@0samuraiE

Hi all,
As the title suggests, I think indices should be wrapped in Val, because otherwise the register usage becomes significantly higher in some cases (up to 2.5×). The overhead seems to occur only when indices is non-contiguous.

My personal opinion is that although wrapping in Val triggers recompilation whenever the indices change, in large-scale computations the indices often remain constant across many kernel launches. Non-contiguous indices are also sometimes used to simplify algorithms or to express access patterns more clearly, so I believe the trade-off is acceptable and may even be preferable in many realistic settings.
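
For context (an aside, not part of the MWE below): an integer range is an isbits value, so it can be lifted into the type domain via Val, and each distinct range then produces a distinct kernel specialization:

julia> Val(1:2:7)                 # the range value becomes part of the type
Val{1:2:7}()

julia> Val(1:2:7) === Val(1:2:7)  # same indices, same specialization: no recompilation
true

julia> typeof(Val(1:2:7)) == typeof(Val(1:2:9))  # different indices: recompilation
false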

Any thoughts or confirmation would be appreciated.

If I'm mistaken, feel free to ignore this.
Thanks!

Below is a minimal working example comparing two cases:

julia> using KernelAbstractions

julia> using CUDA

julia> @kernel inbounds = true cpu = false unsafe_indices = true function _forindices_global!(f, indices)
           N = @groupsize()[1]
           iblock = @index(Group, Linear)
           ithread = @index(Local, Linear)
           # Global linear index, computed manually from the group and local indices.
           i = ithread + (iblock - 0x1) * N

           # Guard: the last block may be only partially filled.
           if i <= length(indices)
               f(indices[i])
           end
       end

julia> function _forindices_gpu(f, indices, backend::GPU; block_size::Int=256)
           # Round up so every index is covered by a thread.
           blocks = (length(indices) + block_size - 1) ÷ block_size
           _forindices_global!(backend, block_size)(f, indices; ndrange=(block_size * blocks,))
           nothing
       end
_forindices_gpu (generic function with 1 method)

julia> function test(X)
           _forindices_gpu(1:2:length(X), get_backend(X)) do i
               X[i] = 1
           end
       end
test (generic function with 1 method)

julia> X = CUDA.zeros(4)
4-element CuArray{Float32, 1, CUDA.DeviceMemory}:
 0.0
 0.0
 0.0
 0.0

julia> CUDA.@profile raw = true trace = true test(X)
Profiler ran for 237.38 ms, capturing 84 events.

...
Device-side activity: GPU was busy for 18.36 µs (0.01% of the trace)
┌──────┬───────────┬───────────┬────────────────────────────┬────────┬─────────┬────────┬──────┬─────────────────────────────────┬──────────────────────┬─────────┬──
│   ID │     Start │      Time │                     Device │ Stream │ Threads │ Blocks │ Regs │                      Shared Mem │            Local Mem │    Size │ ⋯
├──────┼───────────┼───────────┼────────────────────────────┼────────┼─────────┼────────┼──────┼─────────────────────────────────┼──────────────────────┼─────────┼──
│ 3100 │   2.58 ms │ 476.84 ns │ NVIDIA GeForce RTX 4070 Ti │     13 │       - │      - │    - │                               - │                    - │ 8 bytes │ ⋯
│ 3143 │ 237.32 ms │  17.88 µs │ NVIDIA GeForce RTX 4070 Ti │     13 │     256 │      1 │   38 │ 0 bytes static, 0 bytes dynamic │ 0 bytes / 75.000 MiB │       - │ ⋯
└──────┴───────────┴───────────┴────────────────────────────┴────────┴─────────┴────────┴──────┴─────────────────────────────────┴──────────────────────┴─────────┴──
                                                                                                                                                    2 columns omitted


julia> @kernel inbounds = true cpu = false unsafe_indices = true function _forindices_global!(
           f, ::Val{indices}
       ) where {indices}
           # indices is now a type parameter, i.e. a compile-time constant of the kernel.
           N = @groupsize()[1]
           iblock = @index(Group, Linear)
           ithread = @index(Local, Linear)
           i = ithread + (iblock - 0x1) * N

           if i <= length(indices)
               f(indices[i])
           end
       end

julia> function _forindices_gpu(f, indices, backend::GPU; block_size::Int=256)
           blocks = (length(indices) + block_size - 1) ÷ block_size
           _forindices_global!(backend, block_size)(f, Val(indices); ndrange=(block_size * blocks,))
           nothing
       end
_forindices_gpu (generic function with 1 method)

julia> function test(X)
           _forindices_gpu(1:2:length(X), get_backend(X)) do i
               X[i] = 1
           end
       end
test (generic function with 1 method)

julia> X = CUDA.zeros(4)
4-element CuArray{Float32, 1, CUDA.DeviceMemory}:
 0.0
 0.0
 0.0
 0.0

julia> CUDA.@profile raw = true trace = true test(X)
Profiler ran for 196.81 ms, capturing 88 events.
...
Device-side activity: GPU was busy for 10.01 µs (0.01% of the trace)
┌──────┬───────────┬───────────┬────────────────────────────┬────────┬─────────┬────────┬──────┬─────────────────────────────────┬──────────────────────┬─────────┬──
│   ID │     Start │      Time │                     Device │ Stream │ Threads │ Blocks │ Regs │                      Shared Mem │            Local Mem │    Size │ ⋯
├──────┼───────────┼───────────┼────────────────────────────┼────────┼─────────┼────────┼──────┼─────────────────────────────────┼──────────────────────┼─────────┼──
│ 4379 │   2.39 ms │ 476.84 ns │ NVIDIA GeForce RTX 4070 Ti │     13 │       - │      - │    - │                               - │                    - │ 8 bytes │ ⋯
│ 4422 │ 196.76 ms │   9.54 µs │ NVIDIA GeForce RTX 4070 Ti │     13 │     256 │      1 │   16 │ 0 bytes static, 0 bytes dynamic │ 0 bytes / 69.375 MiB │       - │ ⋯
└──────┴───────────┴───────────┴────────────────────────────┴────────┴─────────┴────────┴──────┴─────────────────────────────────┴──────────────────────┴─────────┴──
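
In short: with indices wrapped in Val, the same launch uses 16 registers instead of 38 and the kernel takes 9.54 µs instead of 17.88 µs on this toy example.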
