-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[Clang] Fix GPU intrinsic helpers incorrectly sign extending #129560
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Summary: These return values are actually signed, meaning that casting will extend it and then all the bits will be one.
|
@llvm/pr-subscribers-backend-x86 Author: Joseph Huber (jhuber6) ChangesSummary: Full diff: https://github.com/llvm/llvm-project/pull/129560.diff 2 Files Affected:
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 355e75d0b2d42..6ad8e54f4aadd 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -121,7 +121,7 @@ __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
- ((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
+ ((uint64_t)__builtin_amdgcn_readfirstlane(__lo) & 0xFFFFFFFF);
}
// Returns a bitmask of threads in the current lane for which \p x is true.
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 29d0adcabc82f..03594dd9bd6cb 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -131,7 +131,8 @@ __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
__gpu_num_lanes() - 1)
<< 32ull) |
((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id,
- __gpu_num_lanes() - 1));
+ __gpu_num_lanes() - 1) &
+ 0xFFFFFFFF);
}
// Returns a bitmask of threads in the current lane for which \p x is true.
|
|
@llvm/pr-subscribers-backend-amdgpu Author: Joseph Huber (jhuber6) ChangesSummary: Full diff: https://github.com/llvm/llvm-project/pull/129560.diff 2 Files Affected:
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 355e75d0b2d42..6ad8e54f4aadd 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -121,7 +121,7 @@ __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
- ((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
+ ((uint64_t)__builtin_amdgcn_readfirstlane(__lo) & 0xFFFFFFFF);
}
// Returns a bitmask of threads in the current lane for which \p x is true.
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 29d0adcabc82f..03594dd9bd6cb 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -131,7 +131,8 @@ __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
__gpu_num_lanes() - 1)
<< 32ull) |
((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id,
- __gpu_num_lanes() - 1));
+ __gpu_num_lanes() - 1) &
+ 0xFFFFFFFF);
}
// Returns a bitmask of threads in the current lane for which \p x is true.
|
|
@llvm/pr-subscribers-clang Author: Joseph Huber (jhuber6) ChangesSummary: Full diff: https://github.com/llvm/llvm-project/pull/129560.diff 2 Files Affected:
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 355e75d0b2d42..6ad8e54f4aadd 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -121,7 +121,7 @@ __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
- ((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
+ ((uint64_t)__builtin_amdgcn_readfirstlane(__lo) & 0xFFFFFFFF);
}
// Returns a bitmask of threads in the current lane for which \p x is true.
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 29d0adcabc82f..03594dd9bd6cb 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -131,7 +131,8 @@ __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
__gpu_num_lanes() - 1)
<< 32ull) |
((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id,
- __gpu_num_lanes() - 1));
+ __gpu_num_lanes() - 1) &
+ 0xFFFFFFFF);
}
// Returns a bitmask of threads in the current lane for which \p x is true.
|
| uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF); | ||
| return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) | | ||
| ((uint64_t)__builtin_amdgcn_readfirstlane(__lo)); | ||
| ((uint64_t)__builtin_amdgcn_readfirstlane(__lo) & 0xFFFFFFFF); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why not just cast to uint32_t?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
also the builtin really should have been unsigned to begin with
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These are being combined, so they both should be 64-bit. I could cast to unsigned, then cast to unsigned 64-bit, but I think explicitly masking is clearer here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is also clamping the thing you just clamped 2 lines above
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The clamp above likely isn't necessary, as a cast to unsigned will probably truncate, that's probably a no-op after optimizations. But these are different, the input is unsigned but the output is signed, so we need to clamp it again.
arsenm
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should really fix the builtin. It could accept any legal type, but failing that it should just be uint32_t
Yeah it would simplify the code if we could just pass a pointer or whatever. We could probably also do some recognition in the compiler to stitch these kinds of manual splits back together into a single type. |
|
/cherry-pick 4ca8ea8 |
|
/pull-request #129587 |
Summary:
These return values are actually signed, meaning that casting will
extend it and then all the bits will be one.