-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[AMDGPU] Allow forming overflow op and folding abd to usubo if it is legal #156266
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-selectiondag Author: AZero13 (AZero13) ChangesBecause usubo and uaddo are legal in AMDGPU in 32 bits, we want to use it whenever possible. Patch is 221.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156266.diff 46 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 438b6ff55c85f..1d8cea1a14c03 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3455,6 +3455,10 @@ class LLVM_ABI TargetLoweringBase {
/// matching of other patterns.
virtual bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
bool MathUsed) const {
+ // Form it if it is legal.
+ if (isOperationLegal(Opcode, VT))
+ return true;
+
// TODO: The default logic is inherited from code in CodeGenPrepare.
// The opcode should not make a difference by default?
if (Opcode != ISD::UADDO)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a8c7c16e2fa22..d13011019f3d4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9788,7 +9788,8 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const {
// flag if the (scalar) type is illegal as this is more likely to legalize
// cleanly:
// abdu(lhs, rhs) -> sub(xor(sub(lhs, rhs), uof(lhs, rhs)), uof(lhs, rhs))
- if (!IsSigned && VT.isScalarInteger() && !isTypeLegal(VT)) {
+ if (!IsSigned && (isOperationLegal(ISD::USUBO, VT) ||
+ (VT.isScalarInteger() && !isTypeLegal(VT)))) {
SDValue USubO =
DAG.getNode(ISD::USUBO, dl, DAG.getVTList(VT, MVT::i1), {LHS, RHS});
SDValue Cmp = DAG.getNode(ISD::SIGN_EXTEND, dl, VT, USubO.getValue(1));
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index 42c7b90da63d3..28f55511ebb6f 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -169,6 +169,6 @@ attributes #1 = { nounwind }
;.
; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 7e9cb7adf4fc2..181dab8d4ca79 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -105,7 +105,7 @@ declare void @unknown()
define amdgpu_kernel void @kernel_calls_extern() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
-; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
; CHECK-NEXT: call void @unknown()
; CHECK-NEXT: ret void
;
@@ -115,8 +115,8 @@ define amdgpu_kernel void @kernel_calls_extern() {
define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
-; CHECK-SAME: ) #[[ATTR2]] {
-; CHECK-NEXT: call void @unknown() #[[ATTR6:[0-9]+]]
+; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-NEXT: call void @unknown() #[[ATTR7:[0-9]+]]
; CHECK-NEXT: ret void
;
call void @unknown() #0
@@ -125,7 +125,7 @@ define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: call void [[INDIRECT]]()
; CHECK-NEXT: ret void
;
@@ -135,8 +135,8 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR6]]
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR7]]
; CHECK-NEXT: ret void
;
call void %indirect() #0
@@ -254,11 +254,12 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
attributes #0 = { "amdgpu-agpr-alloc"="0" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
index 03c84d1193609..f8a38572c1544 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
@@ -6,7 +6,7 @@
define amdgpu_kernel void @noop_sqrt_fpmath(ptr addrspace(1) %out, float %x) #0 {
; CHECK-LABEL: define amdgpu_kernel void @noop_sqrt_fpmath
; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META0:![0-9]+]]
; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: ret void
;
@@ -20,9 +20,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32(ptr addrspace(1) %out, float %x) {
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1:![0-9]+]]
; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2:![0-9]+]]
; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -51,7 +51,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32(ptr addrspace(1) %out, float %x) {
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1:![0-9]+]]
; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -88,9 +88,9 @@ define amdgpu_kernel void @sqrt_fpmath_v2f32(ptr addrspace(1) %out, <2 x float>
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
; IEEE-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META1]]
; IEEE-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META2]]
; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
; IEEE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
@@ -149,7 +149,7 @@ define amdgpu_kernel void @sqrt_fpmath_v2f32(ptr addrspace(1) %out, <2 x float>
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
; DAZ-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META1]]
; DAZ-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
@@ -206,7 +206,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nosub(ptr addrspace(1) %out, fl
; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1:![0-9]+]]
; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -243,9 +243,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero(ptr addrspace(1) %out,
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero) [[X:%.*]]) #[[ATTR1]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2]]
; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -274,7 +274,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero(ptr addrspace(1) %out,
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero) [[X:%.*]]) #[[ATTR1]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -311,9 +311,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub(ptr addrspace(1)
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero nsub) [[X:%.*]]) #[[ATTR1]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2]]
; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -342,7 +342,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub(ptr addrspace(1)
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero nsub) [[X:%.*]]) #[[ATTR1]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -379,9 +379,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf(ptr addrsp
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(inf nzero nsub) [[X:%.*]]) #[[ATTR1]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2]]
; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -410,7 +410,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf(ptr addrsp
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(inf nzero nsub) [[X:%.*]]) #[[ATTR1]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; DAZ-NEXT: store volatile floa...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu Author: AZero13 (AZero13) ChangesBecause usubo and uaddo are legal in AMDGPU in 32 bits, we want to use it whenever possible. Patch is 221.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156266.diff 46 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 438b6ff55c85f..1d8cea1a14c03 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3455,6 +3455,10 @@ class LLVM_ABI TargetLoweringBase {
/// matching of other patterns.
virtual bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
bool MathUsed) const {
+ // Form it if it is legal.
+ if (isOperationLegal(Opcode, VT))
+ return true;
+
// TODO: The default logic is inherited from code in CodeGenPrepare.
// The opcode should not make a difference by default?
if (Opcode != ISD::UADDO)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a8c7c16e2fa22..d13011019f3d4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9788,7 +9788,8 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const {
// flag if the (scalar) type is illegal as this is more likely to legalize
// cleanly:
// abdu(lhs, rhs) -> sub(xor(sub(lhs, rhs), uof(lhs, rhs)), uof(lhs, rhs))
- if (!IsSigned && VT.isScalarInteger() && !isTypeLegal(VT)) {
+ if (!IsSigned && (isOperationLegal(ISD::USUBO, VT) ||
+ (VT.isScalarInteger() && !isTypeLegal(VT)))) {
SDValue USubO =
DAG.getNode(ISD::USUBO, dl, DAG.getVTList(VT, MVT::i1), {LHS, RHS});
SDValue Cmp = DAG.getNode(ISD::SIGN_EXTEND, dl, VT, USubO.getValue(1));
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index 42c7b90da63d3..28f55511ebb6f 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -169,6 +169,6 @@ attributes #1 = { nounwind }
;.
; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 7e9cb7adf4fc2..181dab8d4ca79 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -105,7 +105,7 @@ declare void @unknown()
define amdgpu_kernel void @kernel_calls_extern() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
-; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
; CHECK-NEXT: call void @unknown()
; CHECK-NEXT: ret void
;
@@ -115,8 +115,8 @@ define amdgpu_kernel void @kernel_calls_extern() {
define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
-; CHECK-SAME: ) #[[ATTR2]] {
-; CHECK-NEXT: call void @unknown() #[[ATTR6:[0-9]+]]
+; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-NEXT: call void @unknown() #[[ATTR7:[0-9]+]]
; CHECK-NEXT: ret void
;
call void @unknown() #0
@@ -125,7 +125,7 @@ define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: call void [[INDIRECT]]()
; CHECK-NEXT: ret void
;
@@ -135,8 +135,8 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR6]]
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR7]]
; CHECK-NEXT: ret void
;
call void %indirect() #0
@@ -254,11 +254,12 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
attributes #0 = { "amdgpu-agpr-alloc"="0" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
index 03c84d1193609..f8a38572c1544 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
@@ -6,7 +6,7 @@
define amdgpu_kernel void @noop_sqrt_fpmath(ptr addrspace(1) %out, float %x) #0 {
; CHECK-LABEL: define amdgpu_kernel void @noop_sqrt_fpmath
; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META0:![0-9]+]]
; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: ret void
;
@@ -20,9 +20,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32(ptr addrspace(1) %out, float %x) {
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1:![0-9]+]]
; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2:![0-9]+]]
; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -51,7 +51,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32(ptr addrspace(1) %out, float %x) {
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1:![0-9]+]]
; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -88,9 +88,9 @@ define amdgpu_kernel void @sqrt_fpmath_v2f32(ptr addrspace(1) %out, <2 x float>
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
; IEEE-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META1]]
; IEEE-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META2]]
; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
; IEEE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
@@ -149,7 +149,7 @@ define amdgpu_kernel void @sqrt_fpmath_v2f32(ptr addrspace(1) %out, <2 x float>
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
; DAZ-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META1]]
; DAZ-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
@@ -206,7 +206,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nosub(ptr addrspace(1) %out, fl
; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1:![0-9]+]]
; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -243,9 +243,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero(ptr addrspace(1) %out,
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero) [[X:%.*]]) #[[ATTR1]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2]]
; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -274,7 +274,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero(ptr addrspace(1) %out,
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero) [[X:%.*]]) #[[ATTR1]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -311,9 +311,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub(ptr addrspace(1)
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero nsub) [[X:%.*]]) #[[ATTR1]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2]]
; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -342,7 +342,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub(ptr addrspace(1)
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero nsub) [[X:%.*]]) #[[ATTR1]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
@@ -379,9 +379,9 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf(ptr addrsp
; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(inf nzero nsub) [[X:%.*]]) #[[ATTR1]] {
; IEEE-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; IEEE-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; IEEE-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; IEEE-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META2]]
; IEEE-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-NEXT: [[TMP1:%.*]] = fcmp olt float [[X]], 0x3810000000000000
; IEEE-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 32, i32 0
@@ -410,7 +410,7 @@ define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf(ptr addrsp
; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(inf nzero nsub) [[X:%.*]]) #[[ATTR1]] {
; DAZ-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
; DAZ-NEXT: store volatile floa...
[truncated]
|
41107ad
to
ed1f5c6
Compare
…legal. Because usubo and uaddo are legal in AMDGPU in 32 bits, we want to use it whenever possible.
Because usubo and uaddo are legal in AMDGPU in 32 bits, we want to use it whenever possible.