@@ -34,7 +34,7 @@ Perform group reduction of `val` using `op`.
3434Result of the reduction.
3535"""
3636macro groupreduce (op, val, neutral, algo)
37- quote
37+ return quote
3838 __groupreduce (
3939 $ (esc (:__ctx__ )),
4040 $ (esc (op)),
@@ -47,7 +47,7 @@ macro groupreduce(op, val, neutral, algo)
4747end
4848
4949macro groupreduce (op, val, neutral, algo, groupsize)
50- quote
50+ return quote
5151 __groupreduce (
5252 $ (esc (:__ctx__ )),
5353 $ (esc (op)),
@@ -66,19 +66,19 @@ function __groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}, ::Val{
6666 @inbounds local_idx ≤ groupsize && (storage[local_idx] = val)
6767 @synchronize ()
6868
69- s:: UInt64 = groupsize ÷ 0x2
70- while s > 0x0
71- if (local_idx - 0x1 ) < s
69+ s:: UInt64 = groupsize ÷ 0x02
70+ while s > 0x00
71+ if (local_idx - 0x01 ) < s
7272 other_idx = local_idx + s
7373 if other_idx ≤ groupsize
7474 @inbounds storage[local_idx] = op (storage[local_idx], storage[other_idx])
7575 end
7676 end
7777 @synchronize ()
78- s >>= 0x1
78+ s >>= 0x01
7979 end
8080
81- if local_idx == 0x1
81+ if local_idx == 0x01
8282 @inbounds val = storage[local_idx]
8383 end
8484 return val
8787# Warp groupreduce.
8888
# Expand `@shfl_down(val, offset)` into a direct call to `__shfl_down`.
# Both arguments are escaped so they are evaluated in the caller's scope; the
# callee itself is spliced in by value from the scope where this macro is defined.
macro shfl_down(val, offset)
    val_expr = esc(val)
    offset_expr = esc(offset)
    return :($__shfl_down($val_expr, $offset_expr))
end
@@ -97,10 +97,10 @@ function __shfl_down end
9797supports_warp_reduction (:: CPU ) = false
9898
# Fold `val` with `op` over a sequence of `@shfl_down` exchanges.
# The step starts at half of the hard-coded 32 and is halved every round
# (16, 8, 4, 2, 1), so the loop body runs five times.
# NOTE(review): 32 is presumably the warp width and `@shfl_down` presumably fetches
# the value held by the lane `step` positions away — confirm against the backend.
@inline function __warp_reduce(val, op)
    step::UInt32 = UInt32(32) ÷ 0x02
    acc = val
    while !iszero(step)
        partner = @shfl_down(acc, step)
        acc = op(acc, partner)
        step >>= 0x01
    end
    return acc
end
@@ -114,17 +114,17 @@ function __groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}, ::Val{
114114 storage = @localmem T __warp_bins
115115
116116 local_idx = @index (Local)
117- lane = (local_idx - 0x1 ) % __warpsize + 0x1
118- warp_id = (local_idx - 0x1 ) ÷ __warpsize + 0x1
117+ lane = (local_idx - 0x01 ) % __warpsize + 0x01
118+ warp_id = (local_idx - 0x01 ) ÷ __warpsize + 0x01
119119
120120 # Each warp performs a reduction and writes results into its own bin in `storage`.
121121 val = __warp_reduce (val, op)
122- @inbounds lane == 0x1 && (storage[warp_id] = val)
122+ @inbounds lane == 0x01 && (storage[warp_id] = val)
123123 @synchronize ()
124124
125125 # Final reduction of the `storage` on the first warp.
126- within_storage = (local_idx - 0x1 ) < groupsize ÷ __warpsize
126+ within_storage = (local_idx - 0x01 ) < groupsize ÷ __warpsize
127127 @inbounds val = within_storage ? storage[lane] : neutral
128- warp_id == 0x1 && (val = __warp_reduce (val, op))
128+ warp_id == 0x01 && (val = __warp_reduce (val, op))
129129 return val
130130end
0 commit comments