Conversation

llvmbot commented Feb 20, 2025

Backport 6cc7ca0

Requested by: @jhuber6

llvmbot added this to the LLVM 20.X Release milestone on Feb 20, 2025

llvmbot commented Feb 20, 2025

@shiltian What do you think about merging this PR to the release branch?

llvmbot requested a review from shiltian on Feb 20, 2025 at 23:11
llvmbot added the clang, backend:X86, clang:headers, and libc labels on Feb 20, 2025
llvmbot commented Feb 20, 2025

@llvm/pr-subscribers-backend-x86

Author: None (llvmbot)

Changes

Backport 6cc7ca0

Requested by: @jhuber6


Full diff: https://github.com/llvm/llvm-project/pull/128085.diff

3 Files Affected:

  • (modified) clang/lib/Headers/gpuintrin.h (+49-25)
  • (modified) clang/lib/Headers/nvptxintrin.h (+4-1)
  • (modified) libc/test/integration/src/__support/GPU/scan_reduce.cpp (+49)
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 11c87e85cd497..efdc3d94ac0b3 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -150,35 +150,33 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                             __builtin_bit_cast(uint64_t, __x), __width));
 }
 
-// Gets the sum of all lanes inside the warp or wavefront.
-#define __DO_LANE_SUM(__type, __suffix)                                        \
-  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
-      uint64_t __lane_mask, __type __x) {                                      \
-    for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) {   \
-      uint32_t __index = __step + __gpu_lane_id();                             \
-      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,           \
-                                          __gpu_num_lanes());                  \
-    }                                                                          \
-    return __gpu_read_first_lane_##__suffix(__lane_mask, __x);                 \
-  }
-__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
-__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
-__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
-__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
-#undef __DO_LANE_SUM
-
 // Gets the accumulator scan of the threads in the warp or wavefront.
 #define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
   _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix(     \
       uint64_t __lane_mask, uint32_t __x) {                                    \
-    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
-      uint32_t __index = __gpu_lane_id() - __step;                             \
-      __bitmask_type bitmask = __gpu_lane_id() >= __step;                      \
-      __x += __builtin_bit_cast(                                               \
-          __type, -bitmask & __builtin_bit_cast(__bitmask_type,                \
-                                                __gpu_shuffle_idx_##__suffix(  \
-                                                    __lane_mask, __index, __x, \
-                                                    __gpu_num_lanes())));      \
+    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
+    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
+        __lane_mask, __first & (__first + 1));                                 \
+    if (__divergent) {                                                         \
+      __type __accum = 0;                                                      \
+      for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) {      \
+        __type __index = __builtin_ctzll(__mask);                              \
+        __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
+                                                    __gpu_num_lanes());        \
+        __x = __gpu_lane_id() == __index ? __accum + __tmp : __x;              \
+        __accum += __tmp;                                                      \
+      }                                                                        \
+    } else {                                                                   \
+      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
+        uint32_t __index = __gpu_lane_id() - __step;                           \
+        __bitmask_type bitmask = __gpu_lane_id() >= __step;                    \
+        __x += __builtin_bit_cast(                                             \
+            __type,                                                            \
+            -bitmask & __builtin_bit_cast(__bitmask_type,                      \
+                                          __gpu_shuffle_idx_##__suffix(        \
+                                              __lane_mask, __index, __x,       \
+                                              __gpu_num_lanes())));            \
+      }                                                                        \
     }                                                                          \
     return __x;                                                                \
   }
@@ -188,6 +186,32 @@ __DO_LANE_SCAN(float, uint32_t, f32);    // float __gpu_lane_scan_f32(m, x)
 __DO_LANE_SCAN(double, uint64_t, f64);   // double __gpu_lane_scan_f64(m, x)
 #undef __DO_LANE_SCAN
 
+// Gets the sum of all lanes inside the warp or wavefront.
+#define __DO_LANE_SUM(__type, __suffix)                                        \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
+      uint64_t __lane_mask, __type __x) {                                      \
+    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
+    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
+        __lane_mask, __first & (__first + 1));                                 \
+    if (__divergent) {                                                         \
+      return __gpu_shuffle_idx_##__suffix(                                     \
+          __lane_mask, 63 - __builtin_clzll(__lane_mask),                      \
+          __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes());    \
+    } else {                                                                   \
+      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
+        uint32_t __index = __step + __gpu_lane_id();                           \
+        __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,         \
+                                            __gpu_num_lanes());                \
+      }                                                                        \
+      return __gpu_read_first_lane_##__suffix(__lane_mask, __x);               \
+    }                                                                          \
+  }
+__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
+__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
+__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
+__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
+#undef __DO_LANE_SUM
+
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");
 
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 40fa2edebe975..0afcb1c5ff0f0 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -151,8 +151,11 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
 __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
                       uint32_t __width) {
+  // Mask out inactive lanes to match AMDGPU behavior.
   uint32_t __mask = (uint32_t)__lane_mask;
-  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+  bool __bitmask = (1ull << __idx) & __lane_mask;
+  return -__bitmask &
+         __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
                                   ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
 }
 
diff --git a/libc/test/integration/src/__support/GPU/scan_reduce.cpp b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
index bc621c3300cbe..1d50e1f99bf31 100644
--- a/libc/test/integration/src/__support/GPU/scan_reduce.cpp
+++ b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
@@ -53,10 +53,59 @@ static void test_scan() {
   EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_id() / 2 + 1 : 0);
 }
 
+static uint32_t random(uint64_t *rand_next) {
+  uint64_t x = *rand_next;
+  x ^= x >> 12;
+  x ^= x << 25;
+  x ^= x >> 27;
+  *rand_next = x;
+  return static_cast<uint32_t>((x * 0x2545F4914F6CDD1Dul) >> 32);
+}
+
+// Scan operations can break down under thread divergence, make sure that the
+// function works under some random divergence. We do this by trivially
+// implementing a scan with shared scratch memory and then comparing the
+// results.
+static void test_scan_divergent() {
+  static uint32_t input[64] = {0};
+  static uint32_t result[64] = {0};
+  uint64_t state = gpu::processor_clock() + __gpu_lane_id();
+
+  for (int i = 0; i < 64; ++i) {
+    uint64_t lanemask = gpu::get_lane_mask();
+    if (random(&state) & (1ull << gpu::get_lane_id())) {
+      uint64_t divergent = gpu::get_lane_mask();
+      uint32_t value = random(&state) % 256;
+      input[gpu::get_lane_id()] = value;
+
+      if (gpu::is_first_lane(divergent)) {
+        uint32_t accumulator = 0;
+        for (uint32_t lane = 0; lane < gpu::get_lane_size(); ++lane) {
+          uint32_t tmp = input[lane];
+          result[lane] = tmp + accumulator;
+          accumulator += tmp;
+        }
+      }
+      gpu::sync_lane(divergent);
+
+      uint32_t scan = gpu::scan(divergent, value);
+      EXPECT_EQ(scan, result[gpu::get_lane_id()]);
+    }
+    if (gpu::is_first_lane(lanemask))
+      __builtin_memset(input, 0, sizeof(input));
+    gpu::sync_lane(lanemask);
+  }
+}
+
 TEST_MAIN(int argc, char **argv, char **envp) {
+  if (gpu::get_thread_id() >= gpu::get_lane_size())
+    return 0;
+
   test_reduce();
 
   test_scan();
 
+  test_scan_divergent();
+
   return 0;
 }
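
The nvptxintrin.h hunk above zeroes the shuffle result whenever the requested source lane is not set in the lane mask, so the NVPTX path matches the AMDGPU convention of returning 0 for inactive lanes. Below is a minimal host-side C sketch of that masking idiom; the function name mask_inactive_lane and the shuffled parameter (standing in for the value returned by __nvvm_shfl_sync_idx_i32) are illustrative, not part of the header.

  #include <stdint.h>
  #include <stdio.h>

  /* Sketch of the idiom added to __gpu_shuffle_idx_u32: if lane `idx` is
     inactive in `lane_mask`, force the shuffled value to 0; otherwise pass it
     through unchanged. `shuffled` is a placeholder for the hardware shuffle
     result. */
  static uint32_t mask_inactive_lane(uint64_t lane_mask, uint32_t idx,
                                     uint32_t shuffled) {
    uint32_t active = ((1ull << idx) & lane_mask) != 0;
    return -active & shuffled; /* all-ones mask keeps the value, zero clears it */
  }

  int main(void) {
    printf("%u\n", mask_inactive_lane(0xFull, 2, 42)); /* lane 2 active   -> 42 */
    printf("%u\n", mask_inactive_lane(0xFull, 8, 42)); /* lane 8 inactive -> 0  */
    return 0;
  }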

shiltian (Contributor)

LGTM

Summary:
The scan operation implemented here only works if there are contiguous
ones in the execution mask that can be used to propagate the result.
There are two solutions to this: one is to enter 'whole-wave-mode' and
forcibly turn the inactive lanes back on; the other is to do the scan
serially. This implementation does the latter because it is more
portable, but it first checks whether the parallel fast path is
applicable.

This needs to be backported for correct behavior and because it fixes a
failing libc test.

(cherry picked from commit 6cc7ca0)
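
The "checks whether the parallel fast path is applicable" part of the summary refers to the `__first & (__first + 1)` test added to both macros: after shifting out the trailing zeros of the lane mask, a contiguous run of active lanes becomes a block of low ones, adding 1 turns that block into a power of two, and the AND is therefore zero; any gap in the mask leaves a nonzero value and selects the serial path. Here is a minimal host-side sketch of that test, using the same GCC/Clang __builtin_ctzll as the header and assuming a nonzero mask; the function name needs_serial_scan is illustrative, not the header's.

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Returns true when the set bits of `lane_mask` are NOT one contiguous run,
     i.e. when the wave is divergent and the serial scan must be used.
     Assumes lane_mask != 0, matching the header's precondition for ctz. */
  static bool needs_serial_scan(uint64_t lane_mask) {
    uint64_t first = lane_mask >> __builtin_ctzll(lane_mask);
    return (first & (first + 1)) != 0;
  }

  int main(void) {
    printf("%d\n", needs_serial_scan(0x00000000000000F0ull)); /* 0b11110000 -> 0 */
    printf("%d\n", needs_serial_scan(0x00000000000000A5ull)); /* 0b10100101 -> 1 */
    return 0;
  }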
tstellar merged commit e0c4a33 into llvm:release/20.x on Feb 21, 2025 (7 of 10 checks passed)
github-actions commented

@jhuber6 (or anyone else): if you would like to add a note about this fix to the release notes (completely optional), please reply to this comment with a one- or two-sentence description of the fix. When you are done, please add the release:note label to this PR.
