From 3030189e247592c9d893af0aa8681656b5269226 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 6 Jun 2024 10:45:12 -0700 Subject: [PATCH 01/13] [AMDGPU] Infer amdgpu-no-flat-scratch-init attribute in AMDGPUAttributor The AMDGPUAnnotateKernelFeatures pass infers the "amdgpu-calls" and "amdgpu-stack-objects" attributes, which are used to infer whether we need to initialize flat scratch. This is, however, not precise. Instead, we should use AMDGPUAttributor and infer amdgpu-no-flat-scratch-init on kernels. Refer to https://github.com/llvm/llvm-project/issues/63586 . --- llvm/lib/Target/AMDGPU/AMDGPUAttributes.def | 1 + llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 43 + ...licit-kernarg-backend-usage-global-isel.ll | 30 +- .../AMDGPU/addrspacecast-constantexpr.ll | 6 +- .../AMDGPU/amdgpu-attributor-no-agpr.ll | 21 +- .../annotate-kernel-features-hsa-call.ll | 112 +- .../AMDGPU/annotate-kernel-features-hsa.ll | 44 +- .../AMDGPU/annotate-kernel-features.ll | 18 +- .../attributor-flatscratchinit-globalisel.ll | 1028 ++++++++++++ .../AMDGPU/attributor-flatscratchinit.ll | 914 +++++++++++ llvm/test/CodeGen/AMDGPU/attributor-noopt.ll | 2 +- .../AMDGPU/call-graph-register-usage.ll | 12 +- .../callee-special-input-sgprs-fixed-abi.ll | 36 +- .../CodeGen/AMDGPU/direct-indirect-call.ll | 4 +- .../AMDGPU/duplicate-attribute-indirect.ll | 4 +- .../test/CodeGen/AMDGPU/flat-address-space.ll | 8 +- .../AMDGPU/implicit-kernarg-backend-usage.ll | 31 +- .../AMDGPU/implicitarg-offset-attributes.ll | 30 +- llvm/test/CodeGen/AMDGPU/ipra.ll | 2 +- llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 248 +-- .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 29 +- .../AMDGPU/lower-module-lds-via-hybrid.ll | 15 +- .../AMDGPU/lower-module-lds-via-table.ll | 15 +- .../AMDGPU/memory-legalizer-flat-agent.ll | 1380 +++++++++++++++++ .../memory-legalizer-flat-nontemporal.ll | 75 + .../memory-legalizer-flat-singlethread.ll | 1380 +++++++++++++++++ .../AMDGPU/memory-legalizer-flat-system.ll | 1380 
+++++++++++++++++ .../AMDGPU/memory-legalizer-flat-volatile.ll | 66 + .../AMDGPU/memory-legalizer-flat-wavefront.ll | 1365 ++++++++++++++++ .../AMDGPU/memory-legalizer-flat-workgroup.ll | 1320 ++++++++++++++++ .../AMDGPU/memory-legalizer-global-agent.ll | 273 ++++ .../memory-legalizer-global-nontemporal.ll | 15 + .../memory-legalizer-global-singlethread.ll | 276 ++++ .../AMDGPU/memory-legalizer-global-system.ll | 261 ++++ .../memory-legalizer-global-volatile.ll | 18 + .../memory-legalizer-global-wavefront.ll | 276 ++++ .../memory-legalizer-global-workgroup.ll | 276 ++++ .../memory-legalizer-local-nontemporal.ll | 9 + .../AMDGPU/memory-legalizer-local-volatile.ll | 6 + .../memory-legalizer-private-nontemporal.ll | 59 +- .../memory-legalizer-private-volatile.ll | 30 +- .../AMDGPU/propagate-flat-work-group-size.ll | 18 +- .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 44 +- .../AMDGPU/recursive_global_initializer.ll | 2 +- .../AMDGPU/remove-no-kernel-id-attribute.ll | 8 +- .../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll | 2 +- .../CodeGen/AMDGPU/simple-indirect-call.ll | 4 +- .../uniform-work-group-attribute-missing.ll | 4 +- .../AMDGPU/uniform-work-group-multistep.ll | 4 +- ...niform-work-group-nested-function-calls.ll | 4 +- ...ork-group-prevent-attribute-propagation.ll | 6 +- .../uniform-work-group-propagate-attribute.ll | 4 +- .../uniform-work-group-recursion-test.ll | 6 +- .../CodeGen/AMDGPU/uniform-work-group-test.ll | 4 +- .../AMDGPU/vgpr-spill-placement-issue61083.ll | 2 +- 55 files changed, 10760 insertions(+), 470 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll create mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def index bacc8e4e821e5..8c1c8219690ba 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def @@ -30,5 +30,6 @@ 
AMDGPU_ATTRIBUTE(WORKITEM_ID_Z, "amdgpu-no-workitem-id-z") AMDGPU_ATTRIBUTE(LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id") AMDGPU_ATTRIBUTE(DEFAULT_QUEUE, "amdgpu-no-default-queue") AMDGPU_ATTRIBUTE(COMPLETION_ACTION, "amdgpu-no-completion-action") +AMDGPU_ATTRIBUTE(FLAT_SCRATCH_INIT, "amdgpu-no-flat-scratch-init") #undef AMDGPU_ATTRIBUTE diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 687a7339da379..76eba1aa9ffcd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -439,6 +439,19 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { indicatePessimisticFixpoint(); return; } + + bool HasAllocaOrASCast = false; + for (BasicBlock &BB : *F) { + for (Instruction &I : BB) { + if (isa<AllocaInst>(I) || isa<AddrSpaceCastInst>(I)) { + HasAllocaOrASCast = true; + removeAssumedBits(FLAT_SCRATCH_INIT); + break; + } + } + if (HasAllocaOrASCast) + break; + } } ChangeStatus updateImpl(Attributor &A) override { @@ -525,6 +538,9 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV)) removeAssumedBits(COMPLETION_ACTION); + if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A)) + removeAssumedBits(FLAT_SCRATCH_INIT); + return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; } @@ -683,6 +699,33 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this, UsedAssumedInformation); } + + // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is + // not to be set. + bool needFlatScratchInit(Attributor &A) { + // This is called on each callee; false means callee shouldn't have + // no-flat-scratch-init.
+ auto CheckForNoFlatScratchInit = [&](Instruction &I) { + const auto &CB = cast<CallBase>(I); + const Value *CalleeOp = CB.getCalledOperand(); + const Function *Callee = dyn_cast<Function>(CalleeOp); + if (!Callee) // indirect call + return CB.isInlineAsm(); + + if (Callee->isIntrinsic()) + return true; + + const auto *CalleeInfo = A.getAAFor<AAAMDAttributes>( + *this, IRPosition::function(*Callee), DepClassTy::REQUIRED); + return CalleeInfo && CalleeInfo->isAssumed(FLAT_SCRATCH_INIT); + }; + + bool UsedAssumedInformation = false; + // If any callee is false (i.e. need FlatScratchInit), + // checkForAllCallLikeInstructions returns false + return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this, + UsedAssumedInformation); + } }; AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 9443b39dcdc03..80e9ae33d6d45 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -10,9 +10,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 -; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 +; GFX8V4-NEXT: s_add_i32 s8, s8, s11 +; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s9 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_mov_b32 s4, s0 ; GFX8V4-NEXT: s_mov_b32 s5, s3 @@ -23,6 +25,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1 ; GFX8V4-NEXT:
v_mov_b32_e32 v0, s4 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[6:7], 0 +; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V4-NEXT: flat_store_dword v[0:1], v2 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) @@ -35,9 +38,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0xc8 -; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xc8 +; GFX8V5-NEXT: s_add_i32 s6, s6, s9 +; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_mov_b32 s4, s0 ; GFX8V5-NEXT: s_mov_b32 s5, s2 @@ -47,6 +52,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1 ; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[2:3], 0 +; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V5-NEXT: flat_store_dword v[0:1], v2 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) @@ -59,10 +65,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base -; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_mov_b32 s2, s0 ; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 @@ -71,6 +78,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V4-NEXT: 
s_cselect_b64 s[0:1], s[4:5], 0 +; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) @@ -83,10 +91,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base -; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_mov_b32 s2, s0 ; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 @@ -95,6 +104,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll index cff9ce0506679..96bbcb7ed2149 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -233,9 +233,9 @@ attributes #1 = { nounwind } ; AKF_HSA: attributes #[[ATTR1]] = { nounwind } ;. 
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" 
"amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll index e5d440b96349f..05ee8cabb5e7c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll @@ -116,7 +116,7 @@ define amdgpu_kernel void @kernel_calls_extern() { define amdgpu_kernel void @kernel_calls_extern_marked_callsite() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite( ; CHECK-SAME: ) #[[ATTR4]] { -; CHECK-NEXT: call void @unknown() #[[ATTR9:[0-9]+]] +; CHECK-NEXT: call void @unknown() #[[ATTR10:[0-9]+]] ; CHECK-NEXT: ret void ; call void @unknown() #0 @@ -136,7 +136,7 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) { define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite( ; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] { -; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR9]] +; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR10]] ; CHECK-NEXT: ret void ; call void %indirect() #0 @@ -229,7 +229,7 @@ define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) { define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) { ; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr( -; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR7:[0-9]+]] { ; CHECK-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @also_empty ; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]] @@ -254,14 +254,15 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) { attributes #0 = { "amdgpu-no-agpr" } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR4]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" 
"target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR10]] = { "amdgpu-no-agpr" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index 3d4ae84d9c698..23294eacbe6cb 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -530,7 +530,7 @@ define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9 -; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null) ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -545,7 +545,7 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9 -; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null) ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -593,7 +593,7 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr -; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8 ; ATTRIBUTOR_HSA-NEXT: ret void @@ -611,7 +611,7 @@ define void @use_implicitarg_ptr() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr -; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) 
@llvm.amdgcn.implicitarg.ptr() ; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8 ; ATTRIBUTOR_HSA-NEXT: ret void @@ -628,7 +628,7 @@ define void @func_indirect_use_implicitarg_ptr() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr -; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_implicitarg_ptr() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -645,7 +645,7 @@ define internal void @defined.func() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func -; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void ; ret void @@ -658,7 +658,7 @@ define void @func_call_external() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external -; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @external.func() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -673,7 +673,7 @@ define void @func_call_defined() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_defined -; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] { ; ATTRIBUTOR_HSA-NEXT: call void @defined.func() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -687,8 +687,8 @@ define void @func_call_asm() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm -; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] { -; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR28:[0-9]+]] +; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] { +; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR31:[0-9]+]] ; ATTRIBUTOR_HSA-NEXT: ret void ; call void asm sideeffect "", ""() #3 @@ -702,7 +702,7 @@ define amdgpu_kernel void @kern_call_external() #3 { ; AKF_HSA-NEXT: ret void ; ; 
ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external -; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @external.func() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -717,7 +717,7 @@ define amdgpu_kernel void @func_kern_defined() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_kern_defined -; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] { ; ATTRIBUTOR_HSA-NEXT: call void @defined.func() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -769,7 +769,7 @@ define float @func_indirect_call(ptr %fptr) #3 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call -; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] { +; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]() ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ -788,7 +788,7 @@ define float @func_extern_call() #3 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call -; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @extern() ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ -806,7 +806,7 @@ define float @func_null_call(ptr %fptr) #3 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call -; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] { +; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float null() ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ -827,7 +827,7 @@ define float @func_other_intrinsic_call(float %arg) #3 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define 
{{[^@]+}}@func_other_intrinsic_call -; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR17]] { +; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR20]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]]) ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ -845,7 +845,7 @@ define amdgpu_kernel void @kern_sanitize_address() #4 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -861,7 +861,7 @@ define void @func_sanitize_address() #4 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -877,7 +877,7 @@ define void @func_indirect_sanitize_address() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -893,7 +893,7 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -912,7 +912,7 @@ define amdgpu_kernel void @kern_decl_sanitize_address() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address -; 
ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR21]] { ; ATTRIBUTOR_HSA-NEXT: call void @extern_func_sanitize_address() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -928,7 +928,7 @@ define internal void @enqueue_block_def() #6 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def -; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR28:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void ; ret void @@ -941,7 +941,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl -; ATTRIBUTOR_HSA-SAME: () #[[ATTR26:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR29:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_decl() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -956,7 +956,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_def() { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def -; ATTRIBUTOR_HSA-SAME: () #[[ATTR27:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR30:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_def() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -969,7 +969,7 @@ define void @unused_enqueue_block() { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block -; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] { ; ATTRIBUTOR_HSA-NEXT: ret void ; ret void @@ -980,7 +980,7 @@ define internal void @known_func() { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@known_func -; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] { ; ATTRIBUTOR_HSA-NEXT: ret void ; ret void @@ -994,8 +994,8 @@ define amdgpu_kernel void @kern_callsite_enqueue_block() { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block -; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] { -; ATTRIBUTOR_HSA-NEXT: call void @known_func() 
#[[ATTR29:[0-9]+]] +; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] { +; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR32:[0-9]+]] ; ATTRIBUTOR_HSA-NEXT: ret void ; call void @known_func() #6 @@ -1024,36 +1024,40 @@ attributes #6 = { "enqueued-block" } ; AKF_HSA: attributes #[[ATTR7]] = { "enqueued-block" } ; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" } ;. + ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" 
"amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes 
#[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes 
#[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR23:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR24:[0-9]+]] = { 
"amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { nounwind } -; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "enqueued-block" } +; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes 
#[[ATTR16]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" 
"uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR26:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR27:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR30]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR31]] = { nounwind } +; ATTRIBUTOR_HSA: attributes #[[ATTR32]] = { "enqueued-block" } ;. ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 43cdf85ed3818..e30e013e3e3d8 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -454,7 +454,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 { ; HSA-LABEL: define {{[^@]+}}@use_flat_to_group_addrspacecast -; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { +; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { ; HSA-NEXT: [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) ; HSA-NEXT: store volatile i32 0, ptr addrspace(3) [[FTOS]], align 4 ; HSA-NEXT: ret void @@ -466,7 +466,7 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 { define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) #1 { ; HSA-LABEL: define {{[^@]+}}@use_flat_to_private_addrspacecast -; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { +; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] { ; HSA-NEXT: [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) ; HSA-NEXT: store volatile i32 0, ptr addrspace(5) [[FTOS]], align 4 ; HSA-NEXT: ret void @@ -503,7 +503,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) % define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 { ; HSA-LABEL: define {{[^@]+}}@use_flat_to_global_addrspacecast -; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { +; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] { ; HSA-NEXT: [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr 
addrspace(1) ; HSA-NEXT: store volatile i32 0, ptr addrspace(1) [[FTOS]], align 4 ; HSA-NEXT: ret void @@ -515,7 +515,7 @@ define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 { define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #1 { ; HSA-LABEL: define {{[^@]+}}@use_flat_to_constant_addrspacecast -; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { +; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] { ; HSA-NEXT: [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(4) ; HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[FTOS]], align 4 ; HSA-NEXT: ret void @@ -534,7 +534,7 @@ define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared -; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] { +; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) ; ATTRIBUTOR_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32 ; ATTRIBUTOR_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) undef, align 4 @@ -555,7 +555,7 @@ define amdgpu_kernel void @use_is_private(ptr %ptr) #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private -; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] { +; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14]] { ; ATTRIBUTOR_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) ; ATTRIBUTOR_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32 ; ATTRIBUTOR_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) undef, align 4 @@ -575,7 +575,7 @@ define amdgpu_kernel void @use_alloca() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca -; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] { ; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) ; ATTRIBUTOR_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 ; 
ATTRIBUTOR_HSA-NEXT: ret void @@ -596,7 +596,7 @@ define amdgpu_kernel void @use_alloca_non_entry_block() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block -; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] { ; ATTRIBUTOR_HSA-NEXT: entry: ; ATTRIBUTOR_HSA-NEXT: br label [[BB:%.*]] ; ATTRIBUTOR_HSA: bb: @@ -621,7 +621,7 @@ define void @use_alloca_func() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func -; ATTRIBUTOR_HSA-SAME: () #[[ATTR13:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) ; ATTRIBUTOR_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 ; ATTRIBUTOR_HSA-NEXT: ret void @@ -643,19 +643,21 @@ attributes #1 = { nounwind } ; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" 
"amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll index 547ff69592ca0..89fe46d975309 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -398,13 +398,13 @@ attributes #1 = { nounwind } ; AKF_CHECK: attributes #[[ATTR1]] = { nounwind } ;. 
; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll new file mode 100644 index 0000000000000..ce5a3eedb5ebb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll @@ -0,0 +1,1028 @@ +; Test the generation of the attribute amdgpu-no-flat-scratch-init +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator < %s | FileCheck -check-prefixes=GFX10 %s + +;; tests of alloca + +define void @without_alloca(i1 %arg0) { + store volatile i1 %arg0, ptr addrspace(1) undef + ret void +} + +define void @with_alloca() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) { + store volatile i1 %arg0, ptr addrspace(1) undef + ret void +} + +define amdgpu_kernel void @with_alloca_cc_kernel() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +; graphics functions won't get the attribute amdgpu-no-flat-scratch-init + +define amdgpu_vs void @with_alloca_cc_vs() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_gs void @with_alloca_cc_gs() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_ps void @with_alloca_cc_ps() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_cs void @with_alloca_cc_cs() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_hs void @with_alloca_cc_hs() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_ls void @with_alloca_cc_ls() { + %temp = alloca i32, addrspace(5) + store 
volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_es void @with_alloca_cc_es() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_gfx void @with_alloca_cc_gfx() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_cs_chain void @with_alloca_cc_cs_chain() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define void @call_without_alloca() { + call void @without_alloca(i1 true) + ret void +} + +define amdgpu_kernel void @call_without_alloca_cc_kernel() { + call void @without_alloca(i1 true) + ret void +} + +define void @call_with_alloca() { + call void @with_alloca() + ret void +} + +define amdgpu_kernel void @call_with_alloca_cc_kernel() { + call void @with_alloca() + ret void +} + +define void @call_both_with_and_without_alloca() { + call void @with_alloca() + call void @without_alloca() + ret void +} + +define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() { + call void @with_alloca() + call void @without_alloca() + ret void +} + +define void @call_call_without_alloca() { + call void @call_without_alloca() + ret void +} + +define amdgpu_kernel void @call_call_without_alloca_cc_kernel() { + call void @call_without_alloca() + ret void +} + +define void @call_call_with_alloca() { + call void @call_with_alloca() + ret void +} + +define amdgpu_kernel void @call_call_with_alloca_cc_kernel() { + call void @call_with_alloca() + ret void +} + +define void @with_alloca_call_without_alloca() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @without_alloca() + ret void +} + +define 
amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @without_alloca() + ret void +} + +define void @with_alloca_call_with_alloca() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @with_alloca() + ret void +} + +define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @with_alloca() + ret void +} + +define void @with_alloca_call_call_without_alloca() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @call_without_alloca() + ret void +} + +define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @call_without_alloca() + ret void +} + +define void @with_alloca_call_call_with_alloca() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @call_with_alloca() + ret void +} + +define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() { + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @call_with_alloca() + ret void +} + +;; tests of addrspacecast + +define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { + store volatile i32 0, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { + store volatile i32 0, ptr addrspace(1) %ptr + ret void +} + +define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { + %stof = addrspacecast ptr addrspace(1) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr 
addrspace(1) %ptr) { + %stof = addrspacecast ptr addrspace(1) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { + store volatile i32 0, ptr addrspace(2) %ptr + ret void +} + +define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { + store volatile i32 0, ptr addrspace(2) %ptr + ret void +} + +define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { + %stof = addrspacecast ptr addrspace(2) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { + %stof = addrspacecast ptr addrspace(2) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { + store volatile i32 0, ptr addrspace(3) %ptr + ret void +} + +define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { + store volatile i32 0, ptr addrspace(3) %ptr + ret void +} + +define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { + %stof = addrspacecast ptr addrspace(3) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { + %stof = addrspacecast ptr addrspace(3) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { + store volatile i32 0, ptr addrspace(4) %ptr + ret void +} + +define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) { + store volatile i32 0, ptr addrspace(4) %ptr + ret void +} + +define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { + %stof = addrspacecast ptr addrspace(4) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void 
@with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) { + %stof = addrspacecast ptr addrspace(4) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + store volatile i32 0, ptr addrspace(5) %ptr + ret void +} + +define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + store volatile i32 0, ptr addrspace(5) %ptr + ret void +} + +define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void 
@without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) 
%ptr) { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +;; tests of mixed alloca and addrspacecast + +define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) { + call void @without_alloca(i1 true) + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void @without_alloca(i1 true) + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) { + call void @without_alloca(i1 true) + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void @without_alloca(i1 true) + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +;; tests of indirect call, intrinsics + +@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4 + +define void @with_indirect_call() { + %fptr = load ptr, ptr addrspace(4) @gv.fptr0 + call void %fptr() + ret void +} + +define amdgpu_kernel void @with_indirect_call_cc_kernel() { + %fptr = load ptr, ptr addrspace(4) @gv.fptr0 + call void %fptr() + ret void +} + +define void @call_with_indirect_call() { + call void @with_indirect_call() + ret void +} + +define amdgpu_kernel void @call_with_indirect_call_cc_kernel() { + call void @with_indirect_call() + ret void +} + +declare 
i32 @llvm.amdgcn.workitem.id.x() + +define void @use_intrinsic_workitem_id_x() { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, ptr addrspace(1) undef + ret void +} + +define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, ptr addrspace(1) undef + ret void +} + +define void @call_use_intrinsic_workitem_id_x() { + call void @use_intrinsic_workitem_id_x() + ret void +} + +define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { + call void @use_intrinsic_workitem_id_x() + ret void +} + +; GFX10: name: without_alloca +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_alloca +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: without_alloca_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_alloca_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_alloca_cc_vs +; GFX10: argumentInfo: +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } +; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr2' } +; +; GFX10: name: with_alloca_cc_gs +; GFX10:
argumentInfo: +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } +; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr5' } +; +; GFX10: name: with_alloca_cc_ps +; GFX10: argumentInfo: +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } +; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr2' } +; +; GFX10: name: with_alloca_cc_cs +; GFX10: argumentInfo: +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } +; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr2' } +; +; GFX10: name: with_alloca_cc_hs +; GFX10: argumentInfo: +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } +; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr5' } +; +; GFX10: name: with_alloca_cc_ls +; GFX10: argumentInfo: +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } +; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr2' } +; +; GFX10: name: with_alloca_cc_es +; GFX10: argumentInfo: +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } +; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr2' } +; +; GFX10: name: with_alloca_cc_gfx +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: psInputAddr: 0 +; +; GFX10: name: with_alloca_cc_cs_chain +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' } +; GFX10-NEXT: psInputAddr: 0 +; +; GFX10: name: with_alloca_cc_cs_chain_preserve +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' } +; GFX10-NEXT: psInputAddr: 0 +; +; GFX10: name: call_without_alloca +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_without_alloca_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: 
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr4' } +; +; GFX10: name: call_with_alloca +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_with_alloca_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: call_both_with_and_without_alloca +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_both_with_and_without_alloca_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: call_call_without_alloca +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_call_without_alloca_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr4' } +; +; GFX10: name: call_call_with_alloca +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: 
dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_call_with_alloca_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_alloca_call_without_alloca +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_alloca_call_without_alloca_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_alloca_call_with_alloca +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_alloca_call_with_alloca_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_alloca_call_call_without_alloca +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: 
'$sgpr12' } +; +; GFX10: name: with_alloca_call_call_without_alloca_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_alloca_call_call_with_alloca +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_alloca_call_call_with_alloca_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: without_global_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: without_global_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_global_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_global_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: 
'$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; +; GFX10: name: without_region_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: without_region_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_region_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_region_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; +; GFX10: name: without_group_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: without_group_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' 
} +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_group_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_group_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; +; GFX10: name: without_constant_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: without_constant_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_constant_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_constant_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: 
flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; +; GFX10: name: without_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: without_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; +; GFX10: name: call_without_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_without_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: 
workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: call_with_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_with_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; +; GFX10: name: call_both_with_and_without_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; +; GFX10: name: call_call_without_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_call_without_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: 
kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: call_call_with_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_call_with_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; +; GFX10: name: call_call_both_with_and_without_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; +; GFX10: name: with_cast_call_without_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_cast_call_without_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; 
GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; +; GFX10: name: with_cast_call_with_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_cast_call_with_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; +; GFX10: name: call_without_alloca_and_without_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_without_alloca_and_without_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: call_without_alloca_and_with_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: 
call_without_alloca_and_with_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; +; GFX10: name: with_indirect_call +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_indirect_call_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr8_sgpr9' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_with_indirect_call +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_with_indirect_call_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr8_sgpr9' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: use_intrinsic_workitem_id_x +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: 
'$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: use_intrinsic_workitem_id_x_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr4' } +; +; GFX10: name: call_use_intrinsic_workitem_id_x +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_use_intrinsic_workitem_id_x_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr4' } diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll new file mode 100644 index 0000000000000..c0d700cc37464 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll @@ -0,0 +1,914 @@ +; Test the generation of the attribute amdgpu-no-flat-scratch-init +; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s + +;; tests of alloca + +define void @without_alloca(i1 %arg0) { +; GFX9-LABEL: define void @without_alloca(i1 %arg0) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @without_alloca(i1 %arg0) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] + store volatile i1 %arg0, ptr addrspace(1) undef + ret void +} + +define void @with_alloca() { +; GFX9-LABEL: define void @with_alloca() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @with_alloca() +; GFX10-SAME: 
#[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) { +; GFX9-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] + store volatile i1 %arg0, ptr addrspace(1) undef + ret void +} + +define amdgpu_kernel void @with_alloca_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +; graphics functions won't get the attribute amdgpu-no-flat-scratch-init + +define amdgpu_vs void @with_alloca_cc_vs() { +; GFX9-LABEL: define amdgpu_vs void @with_alloca_cc_vs() +; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_vs void @with_alloca_cc_vs() +; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_gs void @with_alloca_cc_gs() { +; GFX9-LABEL: define amdgpu_gs void @with_alloca_cc_gs() +; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_gs void @with_alloca_cc_gs() +; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_ps void @with_alloca_cc_ps() { +; GFX9-LABEL: define amdgpu_ps void @with_alloca_cc_ps() +; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_ps void @with_alloca_cc_ps() +; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] + %temp = 
alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_cs void @with_alloca_cc_cs() { +; GFX9-LABEL: define amdgpu_cs void @with_alloca_cc_cs() +; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_cs void @with_alloca_cc_cs() +; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_hs void @with_alloca_cc_hs() { +; GFX9-LABEL: define amdgpu_hs void @with_alloca_cc_hs() +; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_hs void @with_alloca_cc_hs() +; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_ls void @with_alloca_cc_ls() { +; GFX9-LABEL: define amdgpu_ls void @with_alloca_cc_ls() +; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_ls void @with_alloca_cc_ls() +; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_es void @with_alloca_cc_es() { +; GFX9-LABEL: define amdgpu_es void @with_alloca_cc_es() +; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_es void @with_alloca_cc_es() +; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_gfx void @with_alloca_cc_gfx() { +; GFX9-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx() +; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx() +; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_cs_chain void 
@with_alloca_cc_cs_chain() { +; GFX9-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain() +; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain() +; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() { +; GFX9-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() +; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() +; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + ret void +} + +define void @call_without_alloca() { +; GFX9-LABEL: define void @call_without_alloca() +; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @call_without_alloca() +; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] + call void @without_alloca(i1 true) + ret void +} + +define amdgpu_kernel void @call_without_alloca_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel() +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] + call void @without_alloca(i1 true) + ret void +} + +define void @call_with_alloca() { +; GFX9-LABEL: define void @call_with_alloca() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @call_with_alloca() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + call void @with_alloca() + ret void +} + +define amdgpu_kernel void @call_with_alloca_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel() 
+; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + call void @with_alloca() + ret void +} + +define void @call_both_with_and_without_alloca() { +; GFX9-LABEL: define void @call_both_with_and_without_alloca() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @call_both_with_and_without_alloca() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + call void @with_alloca() + call void @without_alloca(i1 true) + ret void +} + +define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + call void @with_alloca() + call void @without_alloca(i1 true) + ret void +} + +define void @call_call_without_alloca() { +; GFX9-LABEL: define void @call_call_without_alloca() +; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @call_call_without_alloca() +; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] + call void @call_without_alloca() + ret void +} + +define amdgpu_kernel void @call_call_without_alloca_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel() +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] + call void @call_without_alloca() + ret void +} + +define void @call_call_with_alloca() { +; GFX9-LABEL: define void @call_call_with_alloca() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @call_call_with_alloca() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + call void @call_with_alloca() + ret void +} + +define amdgpu_kernel void @call_call_with_alloca_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] 
+; +; GFX10-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + call void @call_with_alloca() + ret void +} + +define void @with_alloca_call_without_alloca() { +; GFX9-LABEL: define void @with_alloca_call_without_alloca() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @with_alloca_call_without_alloca() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @without_alloca(i1 true) + ret void +} + +define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @without_alloca(i1 true) + ret void +} + +define void @with_alloca_call_with_alloca() { +; GFX9-LABEL: define void @with_alloca_call_with_alloca() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @with_alloca_call_with_alloca() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @with_alloca() + ret void +} + +define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @with_alloca() + ret void +} + +define void @with_alloca_call_call_without_alloca() { +; GFX9-LABEL: 
define void @with_alloca_call_call_without_alloca() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @with_alloca_call_call_without_alloca() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @call_without_alloca() + ret void +} + +define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @call_without_alloca() + ret void +} + +define void @with_alloca_call_call_with_alloca() { +; GFX9-LABEL: define void @with_alloca_call_call_with_alloca() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @with_alloca_call_call_with_alloca() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @call_with_alloca() + ret void +} + +define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + %temp = alloca i32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 4 + call void @call_with_alloca() + ret void +} + +;; tests of addrspacecast + +define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { +; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; 
+; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] + store volatile i32 0, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] + store volatile i32 0, ptr addrspace(1) %ptr + ret void +} + +define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { +; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + %stof = addrspacecast ptr addrspace(1) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + %stof = addrspacecast ptr addrspace(1) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { +; GFX9-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] + store volatile i32 0, 
ptr addrspace(2) %ptr + ret void +} + +define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] + store volatile i32 0, ptr addrspace(2) %ptr + ret void +} + +define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { +; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + %stof = addrspacecast ptr addrspace(2) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + %stof = addrspacecast ptr addrspace(2) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { +; GFX9-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] + store volatile i32 0, ptr addrspace(3) %ptr + ret void +} + +define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { +; GFX9-LABEL: define 
amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] + store volatile i32 0, ptr addrspace(3) %ptr + ret void +} + +define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { +; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + %stof = addrspacecast ptr addrspace(3) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + %stof = addrspacecast ptr addrspace(3) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { +; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] + store volatile i32 0, ptr addrspace(4) %ptr + ret void +} + +define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: 
define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] + store volatile i32 0, ptr addrspace(4) %ptr + ret void +} + +define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { +; GFX9-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + %stof = addrspacecast ptr addrspace(4) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + %stof = addrspacecast ptr addrspace(4) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] + store volatile i32 0, ptr addrspace(5) %ptr + ret void +} + +define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX10-SAME: 
#[[ATTR_GFX10_NOFSI2:[0-9]+]] + store volatile i32 0, ptr addrspace(5) %ptr + ret void +} + +define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] + call void @without_private_to_flat_addrspacecast(ptr 
addrspace(5) %ptr) + ret void +} + +define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) 
+; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] + call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] + call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define 
amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void 
@with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +;; tests of mixed alloca and addrspacecast + +define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void 
@call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] + call void @without_alloca(i1 true) + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] + call void @without_alloca(i1 true) + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; +; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] + call void @without_alloca(i1 true) + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] + call void @without_alloca(i1 true) + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret 
void +} + +;; tests of indirect call, intrinsics + +@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4 + +define void @with_indirect_call() { +; GFX9-LABEL: define void @with_indirect_call() +; GFX9-SAME: #[[ATTR_GFX9_IND_CALL:[0-9]+]] +; +; GFX10-LABEL: define void @with_indirect_call() +; GFX10-SAME: #[[ATTR_GFX10_IND_CALL:[0-9]+]] + %fptr = load ptr, ptr addrspace(4) @gv.fptr0 + call void %fptr() + ret void +} + +define amdgpu_kernel void @with_indirect_call_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_IND_CALL2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel() +; GFX10-SAME: #[[ATTR_GFX10_IND_CALL2:[0-9]+]] + %fptr = load ptr, ptr addrspace(4) @gv.fptr0 + call void %fptr() + ret void +} + +define void @call_with_indirect_call() { +; GFX9-LABEL: define void @call_with_indirect_call() +; GFX9-SAME: #[[ATTR_GFX9_IND_CALL:[0-9]+]] +; +; GFX10-LABEL: define void @call_with_indirect_call() +; GFX10-SAME: #[[ATTR_GFX10_IND_CALL:[0-9]+]] + call void @with_indirect_call() + ret void +} + +define amdgpu_kernel void @call_with_indirect_call_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_IND_CALL2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel() +; GFX10-SAME: #[[ATTR_GFX10_IND_CALL2:[0-9]+]] + call void @with_indirect_call() + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() + +define void @use_intrinsic_workitem_id_x() { +; GFX9-LABEL: define void @use_intrinsic_workitem_id_x() +; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] +; +; GFX10-LABEL: define void @use_intrinsic_workitem_id_x() +; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, ptr addrspace(1) undef + ret void +} + +define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() { +; GFX9-LABEL:
define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, ptr addrspace(1) undef + ret void +} + +define void @call_use_intrinsic_workitem_id_x() { +; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x() +; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] +; +; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x() +; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] + call void @use_intrinsic_workitem_id_x() + ret void +} + +define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() +; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() +; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] + call void @use_intrinsic_workitem_id_x() + ret void +} + + +; GFX9: attributes #[[ATTR_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg"
"amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_NOFSI3]] = { "amdgpu-no-agpr" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_NOFSI3]] = { "amdgpu-no-agpr" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll index 90562e25a3e9c..470c444ad8cd7 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll @@ -23,7 +23,7 @@ ; COV5: .amdhsa_user_sgpr_queue_ptr 0 ; NOOPT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; NOOPT: .amdhsa_user_sgpr_dispatch_id 1 -; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0 +; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 1 ; NOOPT: .amdhsa_user_sgpr_private_segment_size 0 ; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 ; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1 diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index dbd00f09943c0..0d66ea55a0437 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ 
-33,9 +33,9 @@ define void @indirect_use_vcc() #1 { } ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel: -; CI: ; TotalNumSgprs: 38 -; VI-NOBUG: ; TotalNumSgprs: 40 -; VI-BUG: ; TotalNumSgprs: 96 +; CI: ; NumSgprs: 36 +; VI-NOBUG: ; NumSgprs: 36 +; VI-BUG: ; NumSgprs: 96 ; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 { call void @indirect_use_vcc() @@ -121,9 +121,9 @@ define void @indirect_use_80_sgpr() #1 { } ; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr: -; CI: ; TotalNumSgprs: 84 -; VI-NOBUG: ; TotalNumSgprs: 86 -; VI-BUG: ; TotalNumSgprs: 96 +; CI: ; NumSgprs: 82 +; VI-NOBUG: ; NumSgprs: 82 +; VI-BUG: ; NumSgprs: 96 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 { call void @indirect_use_80_sgpr() ret void diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index 8ef2d89e76d4e..25a5207d3bbb7 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -198,11 +198,11 @@ define hidden void @use_workgroup_id_yz() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x: ; GCN-NOT: s6 -; GCN: s_mov_b32 s12, s6 +; GCN: s_mov_b32 s12, s4 ; GCN: s_mov_b32 s32, 0 -; GCN: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, use_workgroup_id_x@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, use_workgroup_id_x@rel32@hi+12 +; GCN: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+12 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm @@ -216,7 +216,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y: ; GCN-NOT: s12 -; GCN: s_mov_b32 s13, s7 +; GCN: s_mov_b32 s13, s5 ; GCN-NOT: s12 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -232,7 +232,7 @@ define amdgpu_kernel void 
@kern_indirect_use_workgroup_id_y() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z: ; GCN-NOT: s12 ; GCN-NOT: s13 -; GCN: s_mov_b32 s14, s7 +; GCN: s_mov_b32 s14, s5 ; GCN-NOT: s12 ; GCN-NOT: s13 @@ -249,8 +249,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy: ; GCN-NOT: s14 -; GCN: s_mov_b32 s12, s6 -; GCN-NEXT: s_mov_b32 s13, s7 +; GCN: s_mov_b32 s12, s4 +; GCN-NEXT: s_mov_b32 s13, s5 ; GCN-NOT: s14 ; GCN: s_mov_b32 s32, 0 @@ -265,9 +265,9 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 { } ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz: -; GCN: s_mov_b32 s12, s6 -; GCN: s_mov_b32 s13, s7 -; GCN: s_mov_b32 s14, s8 +; GCN: s_mov_b32 s12, s4 +; GCN: s_mov_b32 s13, s5 +; GCN: s_mov_b32 s14, s6 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -282,8 +282,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz: ; GCN-NOT: s13 -; GCN: s_mov_b32 s12, s6 -; GCN-NEXT: s_mov_b32 s14, s7 +; GCN: s_mov_b32 s12, s4 +; GCN-NEXT: s_mov_b32 s14, s5 ; GCN-NOT: s13 ; GCN: s_mov_b32 s32, 0 @@ -299,8 +299,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz: -; GCN: s_mov_b32 s13, s7 -; GCN: s_mov_b32 s14, s8 +; GCN: s_mov_b32 s13, s5 +; GCN: s_mov_b32 s14, s6 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -381,7 +381,7 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 { ; GCN-NOT: s13 ; GCN-NOT: s14 -; GCN-DAG: s_mov_b32 s12, s6 +; GCN-DAG: s_mov_b32 s12, s4 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b ; GCN-NOT: s13 ; GCN-NOT: s14 @@ -399,7 +399,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 { ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y: ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN-DAG: s_mov_b32 s13, s7 +; GCN-DAG: s_mov_b32 s13, s5 ; GCN-DAG: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 
@@ -414,7 +414,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 { ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z: ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN-DAG: s_mov_b32 s14, s7 +; GCN-DAG: s_mov_b32 s14, s5 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll index aa182b720c604..ebca990699878 100644 --- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -35,6 +35,6 @@ define amdgpu_kernel void @test_direct_indirect_call() { ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index 848019c872925..074dba1cbcc93 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -42,6 +42,6 @@ attributes #0 = { "amdgpu-no-dispatch-id" } ;. ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" } ;. -; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll index 0ad53083d0ff3..eb3d356ea59b4 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -20,7 +20,7 @@ define amdgpu_kernel void @store_flat_i32(ptr addrspace(1) %gptr, i32 %x) #0 { } ; GCN-LABEL: {{^}}store_flat_i64: -; GCN: flat_store_{{dwordx2|b64}} +; GCN: flat_store_{{dword|b64}} define amdgpu_kernel void @store_flat_i64(ptr addrspace(1) %gptr, i64 %x) #0 { %fptr = addrspacecast ptr addrspace(1) %gptr to ptr store volatile i64 %x, ptr %fptr, align 8 @@ -28,7 +28,7 @@ define amdgpu_kernel void @store_flat_i64(ptr addrspace(1) %gptr, i64 %x) #0 { } ; GCN-LABEL: {{^}}store_flat_v4i32: -; GCN: flat_store_{{dwordx4|b128}} +; GCN: flat_store_{{dword|b128}} define amdgpu_kernel void @store_flat_v4i32(ptr addrspace(1) %gptr, <4 x i32> %x) #0 { %fptr = addrspacecast ptr addrspace(1) %gptr to ptr store volatile <4 x i32> %x, ptr %fptr, align 16 @@ -65,7 +65,7 @@ define amdgpu_kernel void @load_flat_i32(ptr addrspace(1) noalias %out, ptr addr } ; GCN-LABEL: load_flat_i64: -; GCN: flat_load_{{dwordx2|b64}} +; GCN: flat_load_{{dword|b64}} define amdgpu_kernel void @load_flat_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 { %fptr = addrspacecast ptr addrspace(1) %gptr to ptr %fload = load volatile i64, 
ptr %fptr, align 8 @@ -74,7 +74,7 @@ define amdgpu_kernel void @load_flat_i64(ptr addrspace(1) noalias %out, ptr addr } ; GCN-LABEL: load_flat_v4i32: -; GCN: flat_load_{{dwordx4|b128}} +; GCN: flat_load_{{dword|b128}} define amdgpu_kernel void @load_flat_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 { %fptr = addrspacecast ptr addrspace(1) %gptr to ptr %fload = load volatile <4 x i32>, ptr %fptr, align 32 diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index b89dbd42e0466..c998a4b19121e 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -10,9 +10,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 -; GFX8V4-NEXT: v_mov_b32_e32 v4, 1 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 +; GFX8V4-NEXT: s_add_i32 s8, s8, s11 +; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s9 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 ; GFX8V4-NEXT: s_cselect_b32 s3, s3, 0 @@ -22,6 +24,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX8V4-NEXT: s_cselect_b32 s0, s2, 0 ; GFX8V4-NEXT: s_cselect_b32 s1, s1, 0 +; GFX8V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V4-NEXT: v_mov_b32_e32 v2, s1 ; GFX8V4-NEXT: v_mov_b32_e32 v3, s0 ; GFX8V4-NEXT: flat_store_dword v[0:1], v4 @@ -33,9 +36,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V5-NEXT: s_load_dwordx2 
s[2:3], s[6:7], 0xc8 -; GFX8V5-NEXT: v_mov_b32_e32 v4, 1 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xc8 +; GFX8V5-NEXT: s_add_i32 s6, s6, s9 +; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s7 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 ; GFX8V5-NEXT: s_cselect_b32 s2, s2, 0 @@ -45,6 +50,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5-NEXT: v_mov_b32_e32 v1, s2 ; GFX8V5-NEXT: s_cselect_b32 s0, s3, 0 ; GFX8V5-NEXT: s_cselect_b32 s1, s1, 0 +; GFX8V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V5-NEXT: v_mov_b32_e32 v2, s1 ; GFX8V5-NEXT: v_mov_b32_e32 v3, s0 ; GFX8V5-NEXT: flat_store_dword v[0:1], v4 @@ -56,10 +62,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base -; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 ; GFX9V4-NEXT: s_cselect_b32 s2, s3, 0 @@ -69,6 +76,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V4-NEXT: v_mov_b32_e32 v1, s2 ; GFX9V4-NEXT: s_cselect_b32 s0, s5, 0 ; GFX9V4-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V4-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V4-NEXT: v_mov_b32_e32 v3, s0 ; GFX9V4-NEXT: flat_store_dword v[0:1], v4 @@ -80,10 +88,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base -; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 ; GFX9V5-NEXT: s_cselect_b32 s2, s3, 0 @@ -93,6 +102,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-NEXT: v_mov_b32_e32 v1, s2 ; GFX9V5-NEXT: s_cselect_b32 s0, s5, 0 ; GFX9V5-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V5-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V5-NEXT: v_mov_b32_e32 v3, s0 ; GFX9V5-NEXT: flat_store_dword v[0:1], v4 @@ -101,6 +111,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-NEXT: flat_store_dword v[2:3], v0 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: s_endpgm + %flat.private = addrspacecast ptr addrspace(5) %ptr.private to ptr %flat.local = addrspacecast ptr addrspace(3) %ptr.local to ptr store volatile i32 1, ptr %flat.private diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll index 4c21f87297455..5979f1bbacdd1 100644 --- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll @@ -258,25 +258,25 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo ;. 
; V4: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" 
"amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V5: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" 
"uniform-work-group-size"="false" } +; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V6: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V4: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll index 957f404c8cdbe..a2f55d6aa8396 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -30,7 +30,7 @@ define hidden void @func() #1 { ; GCN-NOT: writelane ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 -; GCN: ; TotalNumSgprs: 37 +; GCN: ; NumSgprs: 33 ; GCN: ; NumVgprs: 9 define amdgpu_kernel void @kernel_call() #0 { %vgpr = load volatile i32, ptr addrspace(1) undef diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index d51ace630f692..aebc8315514fb 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -113,24 +113,16 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; 
CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_lshl_b32 s4, s15, 2 @@ -176,24 +168,16 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_lshl_b32 s4, s15, 2 @@ -239,24 +223,16 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { define amdgpu_kernel void 
@module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_lshl_b32 s4, s15, 2 @@ -302,24 +278,16 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s7 
; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_lshl_b32 s4, s15, 2 @@ -352,12 +320,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_add_u32 s8, s6, 8 @@ -371,8 +334,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: v_mov_b32_e32 v3, 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_mov_b32 s15, 0 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ds_write_b16 v4, v3 +; CHECK-NEXT: s_mov_b32 s32, 0 +; 
CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm @@ -385,24 +348,15 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: s_mov_b32 s15, 4 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] @@ -429,12 +383,7 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: 
s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_add_u32 s8, s6, 8 @@ -448,8 +397,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 ; CHECK-NEXT: v_mov_b32_e32 v3, 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_mov_b32 s15, 2 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ds_write_b16 v4, v3 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm @@ -462,24 +411,15 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: s_mov_b32 s15, 6 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, 
use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] @@ -506,12 +446,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_add_u32 s8, s6, 8 @@ -525,8 +460,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: v_mov_b32_e32 v3, 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_mov_b32 s15, 1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ds_write_b16 v4, v3 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm @@ -539,24 +474,15 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: 
s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: s_mov_b32 s15, 5 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] @@ -583,12 +509,7 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_add_u32 s8, s6, 8 @@ -602,8 +523,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: v_mov_b32_e32 v3, 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_mov_b32 s15, 3 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ds_write_b16 v4, v3 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; 
CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm @@ -616,24 +537,15 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: s_mov_b32 s15, 7 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index c201f84cac726..8fadfe3d02666 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -42,28 +42,19 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l ; GCN-LABEL: indirect_lds_id: ; 
GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s8, s6, 8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_addc_u32 s9, s7, 0 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, function_lds_id@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, function_lds_id@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, function_lds_id@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; GCN-NEXT: s_mov_b32 s15, 21 +; GCN-NEXT: s_mov_b32 s12, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: s_endpgm call void @function_lds_id(ptr addrspace(1) %out) ret void diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index 9bbcc6988e311..05ad567478675 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -161,10 +161,7 @@ define amdgpu_kernel void @k01() { ; GCN-LABEL: k01: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; 
GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -200,10 +197,7 @@ define amdgpu_kernel void @k23() { ; GCN-LABEL: k23: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -250,10 +244,7 @@ define amdgpu_kernel void @k123() { ; GCN-LABEL: k123: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll index 72a0aceaae12b..3453ff9d296c0 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -226,10 +226,7 @@ define amdgpu_kernel void @k01() { ; GCN-LABEL: k01: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -273,10 +270,7 @@ define amdgpu_kernel void @k23() { ; GCN-LABEL: k23: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; 
GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -323,10 +317,7 @@ define amdgpu_kernel void @k123() { ; GCN-LABEL: k123: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 1c33d8a19890d..7868fa9a7ce4c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX7-LABEL: flat_agent_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX10-WGP-LABEL: flat_agent_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX10-CU-LABEL: flat_agent_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: 
s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -202,6 +217,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX7-LABEL: flat_agent_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -217,6 +235,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: s_nop 0 @@ -233,6 +255,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX10-CU-LABEL: flat_agent_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -264,6 +290,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -278,6 +306,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -389,6 +419,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_load( ; GFX7-LABEL: flat_agent_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -405,6 +438,10 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX10-WGP-LABEL: flat_agent_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, 
s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -423,6 +460,10 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX10-CU-LABEL: flat_agent_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -456,6 +497,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -471,6 +514,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -591,6 +636,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX7-LABEL: flat_agent_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -608,6 
+656,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -628,6 +680,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -664,6 +720,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -680,6 +738,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -819,6 +879,9 @@ entry: define amdgpu_kernel void @flat_agent_unordered_store( ; GFX7-LABEL: flat_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -830,6 +893,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX10-WGP-LABEL: flat_agent_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -842,6 +909,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX10-CU-LABEL: flat_agent_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -865,6 +936,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -876,6 +949,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -959,6 +1034,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX7-LABEL: flat_agent_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -970,6 +1048,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -982,6 +1064,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX10-CU-LABEL: flat_agent_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1005,6 +1091,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1016,6 +1104,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: 
flat_agent_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1099,6 +1189,9 @@ entry: define amdgpu_kernel void @flat_agent_release_store( ; GFX7-LABEL: flat_agent_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1111,6 +1204,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX10-WGP-LABEL: flat_agent_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1125,6 +1222,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX10-CU-LABEL: flat_agent_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1151,6 +1252,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1163,6 +1266,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1265,6 +1370,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX7-LABEL: flat_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1277,6 +1385,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1291,6 +1403,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1317,6 +1433,8 @@ define amdgpu_kernel void 
@flat_agent_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1329,6 +1447,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1431,6 +1551,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX7-LABEL: flat_agent_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1443,6 +1566,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1456,6 +1583,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1481,6 +1612,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1493,6 +1626,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1585,6 +1720,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX7-LABEL: flat_agent_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1599,6 +1737,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], 
s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1616,6 +1758,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1646,6 +1792,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1660,6 +1808,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1770,6 +1920,9 @@ entry: define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX7-LABEL: flat_agent_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1783,6 +1936,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: 
s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1798,6 +1955,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1826,6 +1987,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1839,6 +2002,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1950,6 +2115,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX7-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], 
s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1965,6 +2133,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1984,6 +2156,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2017,6 +2193,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2032,6 +2210,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2161,6 +2341,9 @@ entry: define 
amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX7-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2176,6 +2359,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -2195,6 +2382,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2228,6 +2419,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2243,6 +2436,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2372,6 +2567,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2388,6 +2586,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2406,6 +2608,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2439,6 +2645,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2454,6 +2662,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2572,6 +2782,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2589,6 +2802,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2609,6 +2826,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, 
s[6:7], 0x8 @@ -2645,6 +2866,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2661,6 +2884,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2802,6 +3027,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2819,6 +3047,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2839,6 +3071,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; 
%entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2875,6 +3111,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2891,6 +3129,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3032,6 +3272,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3057,6 +3300,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), 
s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3082,6 +3329,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3132,6 +3383,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3147,6 +3400,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3265,6 +3520,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3292,6 +3550,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3321,6 +3583,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3376,6 +3642,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3393,6 +3661,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: 
s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3529,6 +3799,9 @@ entry: define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3555,6 +3828,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3582,6 +3859,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3635,6 +3916,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3651,6 +3934,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3788,6 +4073,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3816,6 +4104,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3847,6 +4139,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 
s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3905,6 +4201,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3923,6 +4221,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4078,6 +4378,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4106,6 +4409,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4137,6 +4444,10 
@@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4195,6 +4506,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4213,6 +4526,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4368,6 +4683,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4395,6 +4713,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_agent_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4424,6 +4746,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4479,6 +4805,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4496,6 +4824,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4632,6 +4962,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; 
GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4659,6 +4992,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4688,6 +5025,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4743,6 +5084,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4760,6 +5103,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_agent_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4896,6 +5241,9 @@ entry: define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4924,6 +5272,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4955,6 +5307,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5013,6 +5369,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5031,6 +5389,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5186,6 +5546,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5214,6 +5577,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5245,6 +5612,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5303,6 +5674,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5321,6 +5694,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5476,6 +5851,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5504,6 +5882,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5535,6 +5917,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5593,6 +5979,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5611,6 +5999,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5766,6 +6156,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; 
GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5794,6 +6187,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5825,6 +6222,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5883,6 +6284,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5901,6 +6304,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6056,6 +6461,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6084,6 +6492,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6115,6 +6527,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6173,6 +6589,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, 
s[8:9], 0x8 @@ -6191,6 +6609,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6346,6 +6766,9 @@ entry: define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6374,6 +6797,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6405,6 +6832,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6463,6 +6894,8 @@ define amdgpu_kernel void 
@flat_agent_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6481,6 +6914,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6636,6 +7071,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6664,6 +7102,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6695,6 +7137,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; 
GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6753,6 +7199,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6771,6 +7219,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6926,6 +7376,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6954,6 +7407,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, 
s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6985,6 +7442,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -7043,6 +7504,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7061,6 +7524,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7216,6 +7681,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, 
s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7244,6 +7712,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7272,6 +7744,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7328,6 +7804,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7346,6 +7824,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7491,6 +7971,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7520,6 +8003,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7550,6 +8037,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7608,6 +8099,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7627,6 +8120,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7781,6 +8276,9 @@ entry: define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7810,6 +8308,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7840,6 +8342,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: 
s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7899,6 +8405,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7918,6 +8426,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8082,6 +8592,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8112,6 +8625,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8144,6 +8661,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8205,6 +8726,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8225,6 +8748,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8402,6 +8927,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: 
s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8432,6 +8960,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8464,6 +8996,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8525,6 +9061,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8545,6 +9083,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8722,6 +9262,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8751,6 +9294,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8781,6 +9328,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8839,6 +9390,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8858,6 +9411,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9016,6 +9571,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9045,6 +9603,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9075,6 +9637,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, 
s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9133,6 +9699,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9152,6 +9720,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9306,6 +9876,9 @@ entry: define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9336,6 +9909,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), 
s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9368,6 +9945,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9429,6 +10010,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9449,6 +10032,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9626,6 +10211,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9656,6 +10244,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9688,6 +10280,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9749,6 +10345,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9769,6 +10367,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9946,6 +10546,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9976,6 +10579,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10008,6 +10615,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10069,6 +10680,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10089,6 +10702,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10266,6 +10881,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10296,6 +10914,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10328,6 +10950,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10389,6 +11015,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10409,6 +11037,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10586,6 +11216,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10616,6 +11249,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10648,6 +11285,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10709,6 +11350,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10729,6 +11372,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10902,6 +11547,9 @@ entry: define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10932,6 +11580,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10964,6 +11616,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11025,6 +11681,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11045,6 +11703,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11222,6 +11882,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11252,6 +11915,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11284,6 +11951,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11345,6 +12016,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11365,6 +12038,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11542,6 +12217,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11572,6 +12250,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11604,6 +12286,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11665,6 +12351,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11685,6 +12373,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11862,6 +12552,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX7-LABEL: flat_agent_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11877,6 +12570,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11893,6 +12590,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11924,6 +12625,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11938,6 +12641,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12049,6 +12754,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX7-LABEL: flat_agent_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12064,6 +12772,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; 
GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12080,6 +12792,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12111,6 +12827,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12125,6 +12843,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12236,6 +12956,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX7-LABEL: flat_agent_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12253,6 +12976,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12272,6 +12999,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12307,6 +13038,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12323,6 +13056,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12448,6 +13183,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX7-LABEL: flat_agent_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12466,6 +13204,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12487,6 +13229,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12525,6 +13271,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12542,6 +13290,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12686,6 +13436,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX7-LABEL: flat_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12697,6 +13450,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12709,6 +13466,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12732,6 +13493,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; 
GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12743,6 +13506,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12826,6 +13591,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX7-LABEL: flat_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12837,6 +13605,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12849,6 +13621,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, 
s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12872,6 +13648,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12883,6 +13661,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12966,6 +13746,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX7-LABEL: flat_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12978,6 +13761,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, 
s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12992,6 +13779,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13018,6 +13809,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13030,6 +13823,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13132,6 +13927,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX7-LABEL: flat_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13144,6 +13942,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; 
GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13158,6 +13960,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13184,6 +13990,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13196,6 +14004,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13298,6 +14108,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13310,6 +14123,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13323,6 +14140,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -13348,6 +14169,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13360,6 +14183,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; 
GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13452,6 +14277,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13466,6 +14294,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13482,6 +14314,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -13511,6 +14347,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], 
s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13525,6 +14363,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13633,6 +14473,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13646,6 +14489,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13661,6 +14508,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ 
-13689,6 +14540,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13702,6 +14555,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13813,6 +14668,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13828,6 +14686,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13846,6 +14708,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; 
GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -13878,6 +14744,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13893,6 +14761,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -14020,6 +14890,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14035,6 +14908,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -14053,6 +14930,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -14085,6 +14966,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -14100,6 +14983,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -14227,6 +15112,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: 
s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14244,6 +15132,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14263,6 +15155,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14298,6 +15194,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14314,6 +15212,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14437,6 +15337,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14455,6 +15358,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14476,6 +15383,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14514,6 +15425,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 
@@ -14531,6 +15444,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14677,6 +15592,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14695,6 +15613,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14716,6 +15638,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14754,6 +15680,8 @@ define amdgpu_kernel void 
@flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14771,6 +15699,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14917,6 +15847,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14942,6 +15875,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14967,6 +15904,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; 
GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15017,6 +15958,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15032,6 +15975,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15150,6 +16095,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15177,6 +16125,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15205,6 +16157,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15259,6 +16215,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15276,6 +16234,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15410,6 +16370,9 @@ entry: define 
amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15436,6 +16399,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15463,6 +16430,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15516,6 +16487,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 
@@ -15532,6 +16505,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15669,6 +16644,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15697,6 +16675,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15727,6 +16709,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword 
s7, s[4:5], 0x8 @@ -15784,6 +16770,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15802,6 +16790,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15955,6 +16945,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15983,6 +16976,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ 
-16013,6 +17010,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16070,6 +17071,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16088,6 +17091,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16241,6 +17246,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16268,6 +17276,10 @@ define 
amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16296,6 +17308,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16350,6 +17366,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16367,6 +17385,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16501,6 +17521,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16528,6 +17551,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16556,6 +17583,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16610,6 +17641,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16627,6 +17660,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16761,6 +17796,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16789,6 +17827,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16819,6 +17861,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16876,6 +17922,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16894,6 +17942,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17047,6 +18097,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17075,6 +18128,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17105,6 +18162,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17162,6 +18223,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17180,6 +18243,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17333,6 +18398,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ 
-17361,6 +18429,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17391,6 +18463,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17448,6 +18524,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17466,6 +18544,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17619,6 +18699,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17647,6 +18730,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17677,6 +18764,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17734,6 +18825,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17752,6 +18845,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17905,6 +19000,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17933,6 +19031,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17963,6 +19065,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18020,6 +19126,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18038,6 +19146,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18191,6 +19301,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18219,6 +19332,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18249,6 +19366,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18306,6 +19427,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18324,6 +19447,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18477,6 +19602,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, 
s[4:5], 0x2 @@ -18505,6 +19633,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18535,6 +19667,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18592,6 +19728,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18610,6 +19748,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18763,6 +19903,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18791,6 +19934,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18821,6 +19968,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18878,6 +20029,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18896,6 +20049,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19049,6 +20204,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19077,6 +20235,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19105,6 +20267,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19161,6 +20327,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19179,6 +20347,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19324,6 +20494,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19354,6 +20527,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19385,6 +20562,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19445,6 +20626,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19465,6 +20648,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19624,6 +20809,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19653,6 +20841,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19683,6 +20875,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19742,6 +20938,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19761,6 +20959,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19925,6 +21125,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19956,6 +21159,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19989,6 +21196,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20052,6 +21263,8 @@ define amdgpu_kernel void 
@flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20073,6 +21286,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20255,6 +21470,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20286,6 +21504,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20319,6 +21541,10 @@ define 
amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20382,6 +21608,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20403,6 +21631,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20585,6 +21815,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20615,6 +21848,10 
@@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20646,6 +21883,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20706,6 +21947,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20726,6 +21969,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20889,6 +22134,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20919,6 +22167,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20950,6 +22202,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21010,6 +22266,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21030,6 +22288,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21189,6 +22449,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21220,6 +22483,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21253,6 +22520,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), 
s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21316,6 +22587,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21337,6 +22610,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21519,6 +22794,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21550,6 +22828,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), 
s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21583,6 +22865,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21646,6 +22932,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21667,6 +22955,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21849,6 +23139,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21880,6 +23173,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21913,6 +23210,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21976,6 +23277,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21997,6 +23300,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22179,6 +23484,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22210,6 +23518,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22243,6 +23555,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22306,6 +23622,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; 
GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22327,6 +23645,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22509,6 +23829,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22540,6 +23863,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22573,6 +23900,10 @@ define amdgpu_kernel void 
@flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22636,6 +23967,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22657,6 +23990,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22835,6 +24170,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22866,6 +24204,10 @@ define amdgpu_kernel void 
@flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22899,6 +24241,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22962,6 +24308,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22983,6 +24331,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23165,6 +24515,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23196,6 +24549,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23229,6 +24586,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23292,6 +24653,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23313,6 +24676,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23495,6 +24860,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23526,6 +24894,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23559,6 +24931,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23622,6 +24998,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23643,6 +25021,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index b2340caa2933f..f189562bafe5f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_0: 
; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-CU-LABEL: flat_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -202,6 +217,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-LABEL: flat_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[8:9], 
s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 @@ -230,6 +248,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 @@ -257,6 +279,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-CU-LABEL: flat_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 @@ -312,6 +338,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff @@ -341,6 +369,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff @@ -551,6 +581,9 @@ 
entry: define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX7-LABEL: flat_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -566,6 +599,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -582,6 +619,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-CU-LABEL: flat_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -613,6 +654,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -627,6 +670,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -738,6 +783,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-LABEL: flat_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -766,6 +814,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -794,6 +846,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-CU-LABEL: flat_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -850,6 +906,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -880,6 +938,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1091,6 +1151,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX7-LABEL: flat_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -1107,6 +1170,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1124,6 +1191,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX10-CU-LABEL: flat_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1157,6 +1228,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; 
GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1172,6 +1245,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index 304c80d7bb24d..089d2a69facc1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX7-LABEL: flat_singlethread_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX10-WGP-LABEL: flat_singlethread_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], 
s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX10-CU-LABEL: flat_singlethread_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -202,6 +217,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX7-LABEL: flat_singlethread_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -217,6 +235,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_load: ; GFX10-WGP: ; %bb.0: 
; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -233,6 +255,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -264,6 +290,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -278,6 +306,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -389,6 +419,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX7-LABEL: flat_singlethread_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -404,6 +437,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -420,6 +457,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -451,6 +492,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -465,6 +508,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -576,6 +621,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX7-LABEL: flat_singlethread_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -591,6 +639,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -607,6 +659,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -638,6 +694,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -652,6 +710,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -763,6 +823,9 @@ entry: define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX7-LABEL: flat_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -774,6 +837,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX10-WGP-LABEL: flat_singlethread_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -786,6 +853,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX10-CU-LABEL: flat_singlethread_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -809,6 +880,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -820,6 +893,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -903,6 +978,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX7-LABEL: flat_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -914,6 +992,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -926,6 +1008,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; 
GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -949,6 +1035,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -960,6 +1048,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1043,6 +1133,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_store( ; GFX7-LABEL: flat_singlethread_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1054,6 +1147,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX10-WGP-LABEL: flat_singlethread_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1066,6 +1163,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; 
GFX10-CU-LABEL: flat_singlethread_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1089,6 +1190,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1100,6 +1203,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1183,6 +1288,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX7-LABEL: flat_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1194,6 +1302,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1206,6 +1318,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1229,6 +1345,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1240,6 +1358,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1323,6 +1443,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 
s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1335,6 +1458,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1348,6 +1475,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1373,6 +1504,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1385,6 +1518,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ 
-1477,6 +1612,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX7-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1489,6 +1627,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1502,6 +1644,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1527,6 +1673,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1539,6 +1687,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; 
GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1631,6 +1781,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX7-LABEL: flat_singlethread_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1643,6 +1796,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1656,6 +1813,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1681,6 +1842,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1693,6 +1856,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1785,6 +1950,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1797,6 +1965,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1810,6 +1982,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1835,6 +2011,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1847,6 +2025,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1939,6 +2119,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1951,6 +2134,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ 
-1964,6 +2151,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1989,6 +2180,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2001,6 +2194,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2093,6 +2288,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2108,6 +2306,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX10-WGP: 
; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2124,6 +2326,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2155,6 +2361,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2169,6 +2377,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2278,6 +2488,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2293,6 +2506,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2309,6 +2526,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2340,6 +2561,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2354,6 +2577,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; 
GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2463,6 +2688,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2478,6 +2706,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2494,6 +2726,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2525,6 +2761,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2539,6 +2777,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2648,6 +2888,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2673,6 +2916,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2698,6 +2945,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), 
s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2748,6 +2999,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2763,6 +3016,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2881,6 +3136,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2906,6 +3164,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: 
s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2931,6 +3193,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2981,6 +3247,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2996,6 +3264,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3114,6 +3384,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3139,6 +3412,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3164,6 +3441,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3214,6 +3495,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3229,6 +3512,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3347,6 +3632,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3372,6 +3660,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3397,6 +3689,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3447,6 +3743,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3462,6 +3760,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3580,6 +3880,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3605,6 +3908,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3630,6 +3937,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3680,6 +3991,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3695,6 +4008,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3813,6 +4128,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3838,6 +4156,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3863,6 +4185,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3913,6 +4239,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3928,6 +4256,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4046,6 +4376,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 
8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4071,6 +4404,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4096,6 +4433,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4146,6 +4487,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4161,6 +4504,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4279,6 +4624,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4304,6 +4652,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4329,6 +4681,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4379,6 +4735,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4394,6 +4752,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4512,6 +4872,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4537,6 +4900,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4562,6 +4929,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4612,6 +4983,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4627,6 +5000,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4745,6 +5120,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4770,6 +5148,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4795,6 +5177,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4845,6 +5231,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4860,6 +5248,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4978,6 +5368,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5003,6 +5396,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5028,6 +5425,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5078,6 +5479,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5093,6 +5496,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5211,6 +5616,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5236,6 +5644,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5261,6 +5673,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5311,6 +5727,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5326,6 +5744,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5444,6 +5864,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5469,6 +5892,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5494,6 +5921,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5544,6 +5975,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5559,6 +5992,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5677,6 +6112,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5702,6 +6140,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5727,6 +6169,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5777,6 +6223,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5792,6 +6240,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5910,6 +6360,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, 
s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5935,6 +6388,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5960,6 +6417,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6010,6 +6471,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6025,6 +6488,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6143,6 +6608,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6171,6 +6639,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6199,6 +6671,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6255,6 +6731,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6273,6 +6751,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6418,6 +6898,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6446,6 +6929,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6474,6 +6961,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6530,6 +7021,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6548,6 +7041,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6693,6 +7188,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6721,6 +7219,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry 
+; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6749,6 +7251,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6805,6 +7311,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6823,6 +7331,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6968,6 +7478,9 @@ entry: define amdgpu_kernel void 
@flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6996,6 +7509,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7024,6 +7541,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7080,6 +7601,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 @@ -7098,6 +7621,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7243,6 +7768,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7271,6 +7799,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7299,6 +7831,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; 
GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7355,6 +7891,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7373,6 +7911,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7518,6 +8058,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7546,6 +8089,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], 
s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7574,6 +8121,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7630,6 +8181,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7648,6 +8201,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7793,6 +8348,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 
s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7821,6 +8379,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7849,6 +8411,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7905,6 +8471,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7923,6 +8491,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8068,6 +8638,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8096,6 +8669,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8124,6 +8701,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8180,6 +8761,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8198,6 +8781,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8343,6 +8928,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8371,6 +8959,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8399,6 +8991,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: 
s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8455,6 +9051,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8473,6 +9071,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8618,6 +9218,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8646,6 +9249,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 
s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8674,6 +9281,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8730,6 +9341,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8748,6 +9361,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8893,6 +9508,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: 
; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8921,6 +9539,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8949,6 +9571,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9005,6 +9631,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9023,6 +9651,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; 
GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9168,6 +9798,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9196,6 +9829,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9224,6 +9861,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9280,6 +9921,8 @@ define amdgpu_kernel void 
@flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9298,6 +9941,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9443,6 +10088,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9471,6 +10119,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9499,6 +10151,10 @@ define amdgpu_kernel void 
@flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9555,6 +10211,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9573,6 +10231,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9718,6 +10378,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9746,6 +10409,10 @@ define amdgpu_kernel void 
@flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9774,6 +10441,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9830,6 +10501,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9848,6 +10521,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9993,6 +10668,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10021,6 +10699,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10049,6 +10731,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10105,6 +10791,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10123,6 +10811,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10268,6 +10958,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX7-LABEL: flat_singlethread_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10283,6 +10976,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10299,6 +10996,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -10330,6 +11031,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -10344,6 +11047,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -10455,6 +11160,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10470,6 +11178,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10486,6 +11198,10 @@ define 
amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -10517,6 +11233,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -10531,6 +11249,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -10642,6 +11362,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX7-LABEL: flat_singlethread_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10657,6 +11380,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load: ; 
GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10673,6 +11400,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -10704,6 +11435,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -10718,6 +11451,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -10829,6 +11564,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, 
s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10844,6 +11582,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10860,6 +11602,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -10891,6 +11637,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -10905,6 +11653,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11016,6 +11766,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX7-LABEL: flat_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11027,6 +11780,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11039,6 +11796,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11062,6 +11823,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11073,6 +11836,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11156,6 +11921,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11167,6 +11935,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11179,6 +11951,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: 
s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11202,6 +11978,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11213,6 +11991,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11296,6 +12076,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX7-LABEL: flat_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11307,6 +12090,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11319,6 +12106,10 @@ define amdgpu_kernel 
void @flat_singlethread_one_as_release_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11342,6 +12133,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11353,6 +12146,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11436,6 +12231,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11447,6 +12245,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX10-WGP: ; 
%bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11459,6 +12261,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11482,6 +12288,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11493,6 +12301,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11576,6 +12386,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11588,6 +12401,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11601,6 +12418,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11626,6 +12447,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11638,6 +12461,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11730,6 +12555,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11742,6 +12570,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11755,6 +12587,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11780,6 +12616,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11792,6 +12630,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11884,6 +12724,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11896,6 +12739,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11909,6 +12756,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11934,6 +12785,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11946,6 +12799,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12038,6 +12893,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12050,6 +12908,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], 
s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12063,6 +12925,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12088,6 +12954,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12100,6 +12968,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12192,6 +13062,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12204,6 
+13077,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12217,6 +13094,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12242,6 +13123,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12254,6 +13137,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12346,6 +13231,9 @@ entry: 
define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12361,6 +13249,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12377,6 +13269,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12408,6 +13304,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12422,6 +13320,8 @@ 
define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12531,6 +13431,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12546,6 +13449,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12562,6 +13469,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12593,6 +13504,8 @@ define 
amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12607,6 +13520,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12716,6 +13631,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12731,6 +13649,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12747,6 +13669,10 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12778,6 +13704,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12792,6 +13720,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12901,6 +13831,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12926,6 +13859,10 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12951,6 +13888,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13001,6 +13942,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13016,6 +13959,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13134,6 +14079,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13159,6 +14107,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13184,6 +14136,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13234,6 +14190,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13249,6 +14207,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13367,6 +14327,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13392,6 +14355,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13417,6 +14384,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 
s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13467,6 +14438,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13482,6 +14455,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13600,6 +14575,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13625,6 +14603,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 
+; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13650,6 +14632,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13700,6 +14686,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13715,6 +14703,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13833,6 +14823,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: 
flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13858,6 +14851,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13883,6 +14880,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13933,6 +14934,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13948,6 +14951,8 @@ define 
amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14066,6 +15071,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14091,6 +15099,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14116,6 +15128,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; 
GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14166,6 +15182,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14181,6 +15199,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14299,6 +15319,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14324,6 +15347,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14349,6 +15376,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14399,6 +15430,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14414,6 +15447,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14532,6 +15567,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] 
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14557,6 +15595,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14582,6 +15624,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14632,6 +15678,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14647,6 +15695,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; 
GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14765,6 +15815,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14790,6 +15843,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14815,6 +15872,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14865,6 +15926,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14880,6 +15943,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14998,6 +16063,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15023,6 +16091,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15048,6 +16120,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry 
+; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15098,6 +16174,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15113,6 +16191,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15231,6 +16311,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15256,6 +16339,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: 
; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15281,6 +16368,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15331,6 +16422,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15346,6 +16439,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15464,6 +16559,9 @@ entry: define amdgpu_kernel void 
@flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15489,6 +16587,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15514,6 +16616,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15564,6 +16670,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 @@ -15579,6 +16687,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15697,6 +16807,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15722,6 +16835,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15747,6 +16864,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15797,6 +16918,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15812,6 +16935,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15930,6 +17055,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15955,6 +17083,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 
s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15980,6 +17112,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16030,6 +17166,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16045,6 +17183,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16163,6 +17303,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16188,6 +17331,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16213,6 +17360,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16263,6 +17414,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16278,6 +17431,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16396,6 +17551,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16424,6 +17582,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16452,6 +17614,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16508,6 +17674,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16526,6 +17694,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16671,6 +17841,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16699,6 +17872,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16727,6 +17904,10 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16783,6 +17964,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16801,6 +17984,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16946,6 +18131,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, 
s[6:7], 0x3 @@ -16974,6 +18162,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17002,6 +18194,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17058,6 +18254,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17076,6 +18274,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17221,6 +18421,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17249,6 +18452,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17277,6 +18484,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17333,6 +18544,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17351,6 +18564,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17496,6 +18711,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17524,6 +18742,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17552,6 +18774,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; 
GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17608,6 +18834,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17626,6 +18854,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17771,6 +19001,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17799,6 +19032,10 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17827,6 +19064,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17883,6 +19124,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17901,6 +19144,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18046,6 +19291,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18074,6 +19322,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18102,6 +19354,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18158,6 +19414,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18176,6 +19434,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18321,6 +19581,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18349,6 +19612,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18377,6 +19644,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: 
s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18433,6 +19704,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18451,6 +19724,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18596,6 +19871,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18624,6 +19902,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18652,6 +19934,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18708,6 +19994,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18726,6 +20014,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 @@ -18871,6 +20161,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18899,6 +20192,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18927,6 +20224,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18983,6 +20284,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19001,6 +20304,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19146,6 +20451,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19174,6 +20482,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19202,6 +20514,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, 
s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19258,6 +20574,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19276,6 +20594,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19421,6 +20741,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19449,6 +20772,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: 
s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19477,6 +20804,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19533,6 +20864,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19551,6 +20884,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19696,6 +21031,9 @@ entry: define amdgpu_kernel void 
@flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19724,6 +21062,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19752,6 +21094,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19808,6 +21154,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19826,6 +21174,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19971,6 +21321,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19999,6 +21352,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20027,6 +21384,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20083,6 +21444,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20101,6 +21464,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20246,6 +21611,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20274,6 +21642,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20302,6 +21674,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20358,6 +21734,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20376,6 +21754,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 038b58deb0cf1..69c03ba6a3979 100644 --- 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX7-LABEL: flat_system_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX10-WGP-LABEL: flat_system_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX10-CU-LABEL: flat_system_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -91,6 +104,8 @@ define amdgpu_kernel void 
@flat_system_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -202,6 +217,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_load( ; GFX7-LABEL: flat_system_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -217,6 +235,10 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX10-WGP-LABEL: flat_system_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -233,6 +255,10 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX10-CU-LABEL: flat_system_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -264,6 +290,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -278,6 +306,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -389,6 +419,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_load( ; GFX7-LABEL: flat_system_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -405,6 +438,10 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX10-WGP-LABEL: flat_system_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -423,6 +460,10 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX10-CU-LABEL: flat_system_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 
@@ -456,6 +497,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -472,6 +515,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -593,6 +638,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX7-LABEL: flat_system_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -610,6 +658,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -630,6 +682,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX10-CU-LABEL: flat_system_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -666,6 +722,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -683,6 +741,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -823,6 +883,9 @@ entry: define amdgpu_kernel void @flat_system_unordered_store( ; GFX7-LABEL: flat_system_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -834,6 +897,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX10-WGP-LABEL: flat_system_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 
@@ -846,6 +913,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX10-CU-LABEL: flat_system_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -869,6 +940,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -880,6 +953,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -963,6 +1038,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_store( ; GFX7-LABEL: flat_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -974,6 +1052,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX10-WGP-LABEL: flat_system_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: 
s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -986,6 +1068,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX10-CU-LABEL: flat_system_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1009,6 +1095,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1020,6 +1108,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1103,6 +1193,9 @@ entry: define amdgpu_kernel void @flat_system_release_store( ; GFX7-LABEL: flat_system_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 
s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1115,6 +1208,10 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX10-WGP-LABEL: flat_system_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1129,6 +1226,10 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX10-CU-LABEL: flat_system_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1155,6 +1256,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1168,6 +1271,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1271,6 +1376,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX7-LABEL: 
flat_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1283,6 +1391,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1297,6 +1409,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX10-CU-LABEL: flat_system_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1323,6 +1439,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1336,6 +1454,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1439,6 +1559,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX7-LABEL: flat_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1451,6 +1574,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1464,6 +1591,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1489,6 +1620,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1501,6 +1634,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1593,6 +1728,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX7-LABEL: flat_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1607,6 +1745,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1624,6 +1766,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1654,6 +1800,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_system_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1669,6 +1817,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1780,6 +1930,9 @@ entry: define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX7-LABEL: flat_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1793,6 +1946,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1808,6 +1965,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1836,6 +1997,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1850,6 +2013,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1962,6 +2127,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX7-LABEL: flat_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1977,6 +2145,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1996,6 +2168,10 @@ 
define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2029,6 +2205,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2046,6 +2224,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2177,6 +2357,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX7-LABEL: flat_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2192,6 +2375,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 
s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -2211,6 +2398,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2244,6 +2435,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2261,6 +2454,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2392,6 +2587,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword 
s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2408,6 +2606,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2426,6 +2628,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2459,6 +2665,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2475,6 +2683,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2594,6 +2804,9 @@ entry: define 
amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2611,6 +2824,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2631,6 +2848,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2667,6 +2888,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2685,6 +2908,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: 
flat_system_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2828,6 +3053,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2845,6 +3073,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2865,6 +3097,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2901,6 +3137,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2919,6 +3157,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3062,6 +3302,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3087,6 +3330,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3112,6 +3359,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3162,6 +3413,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3177,6 +3430,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3295,6 +3550,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3322,6 +3580,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 
s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3351,6 +3613,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3406,6 +3672,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3424,6 +3692,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3561,6 +3831,9 @@ entry: define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: 
s_load_dword s7, s[4:5], 0x2 @@ -3587,6 +3860,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3614,6 +3891,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3667,6 +3948,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3684,6 +3967,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3822,6 +4107,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3850,6 +4138,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3881,6 +4173,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3939,6 +4235,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3959,6 +4257,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4116,6 +4416,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4144,6 +4447,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4175,6 +4482,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, 
s[4:5], 0x8 @@ -4233,6 +4544,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4253,6 +4566,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4410,6 +4725,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4437,6 +4755,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4466,6 +4788,10 @@ define amdgpu_kernel void 
@flat_system_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4521,6 +4847,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4539,6 +4867,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4676,6 +5006,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4703,6 +5036,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_system_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4732,6 +5069,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4787,6 +5128,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4805,6 +5148,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4942,6 +5287,9 @@ entry: define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX7-LABEL: 
flat_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4970,6 +5318,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5001,6 +5353,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5059,6 +5415,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5079,6 +5437,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_system_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5236,6 +5596,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5264,6 +5627,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5295,6 +5662,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5353,6 +5724,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5373,6 +5746,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5530,6 +5905,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5558,6 +5936,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5589,6 +5971,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 
s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5647,6 +6033,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5667,6 +6055,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5824,6 +6214,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5852,6 +6245,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5883,6 +6280,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5941,6 +6342,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5961,6 +6364,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6118,6 +6523,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: 
s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6146,6 +6554,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6177,6 +6589,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6235,6 +6651,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6255,6 +6673,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], 
s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6412,6 +6832,9 @@ entry: define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6440,6 +6863,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6471,6 +6898,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6529,6 +6960,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6549,6 +6982,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6706,6 +7141,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6734,6 +7172,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6765,6 +7207,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; 
GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6823,6 +7269,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6843,6 +7291,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7000,6 +7450,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -7028,6 +7481,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -7059,6 +7516,10 @@ define amdgpu_kernel void 
@flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -7117,6 +7578,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7137,6 +7600,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7294,6 +7759,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7322,6 +7790,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7350,6 +7822,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7406,6 +7882,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7424,6 +7902,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7569,6 +8049,9 @@ entry: define 
amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7598,6 +8081,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7628,6 +8115,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7686,6 +8177,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7706,6 
+8199,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7861,6 +8356,9 @@ entry: define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7890,6 +8388,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7920,6 +8422,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ 
-7979,6 +8485,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7999,6 +8507,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8164,6 +8674,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8194,6 +8707,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8226,6 +8743,10 @@ define 
amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8287,6 +8808,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8309,6 +8832,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8488,6 +9013,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8518,6 +9046,10 @@ define amdgpu_kernel void 
@flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8550,6 +9082,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8611,6 +9147,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8633,6 +9171,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 
0x8 @@ -8812,6 +9352,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8841,6 +9384,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8871,6 +9418,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8929,6 +9480,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 @@ -8949,6 +9502,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9108,6 +9663,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9137,6 +9695,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9167,6 +9729,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: 
s_load_dword s8, s[6:7], 0xc @@ -9225,6 +9791,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9245,6 +9813,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9400,6 +9970,9 @@ entry: define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9430,6 +10003,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9462,6 +10039,10 @@ 
define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9523,6 +10104,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9545,6 +10128,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9724,6 +10309,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9754,6 +10342,10 @@ define amdgpu_kernel void 
@flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9786,6 +10378,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9847,6 +10443,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9869,6 +10467,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ 
-10048,6 +10648,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10078,6 +10681,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10110,6 +10717,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10171,6 +10782,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword 
s7, s[8:9], 0x8 @@ -10193,6 +10806,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10372,6 +10987,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10402,6 +11020,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10434,6 +11056,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: 
s_load_dword s8, s[6:7], 0xc @@ -10495,6 +11121,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10517,6 +11145,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10696,6 +11326,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10726,6 +11359,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ 
-10758,6 +11395,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10819,6 +11460,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10841,6 +11484,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11016,6 +11661,9 @@ entry: define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11046,6 +11694,10 @@ define amdgpu_kernel void 
@flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11078,6 +11730,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11139,6 +11795,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11161,6 +11819,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ 
-11340,6 +12000,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11370,6 +12033,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11402,6 +12069,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11463,6 +12134,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword 
s7, s[8:9], 0x8 @@ -11485,6 +12158,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11664,6 +12339,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11694,6 +12372,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11726,6 +12408,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, 
s[6:7], 0xc @@ -11787,6 +12473,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11809,6 +12497,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11988,6 +12678,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX7-LABEL: flat_system_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12003,6 +12696,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12019,6 +12716,10 @@ define amdgpu_kernel void 
@flat_system_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_system_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12050,6 +12751,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12064,6 +12767,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12175,6 +12880,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX7-LABEL: flat_system_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12190,6 +12898,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; 
GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12206,6 +12918,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12237,6 +12953,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12251,6 +12969,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12362,6 +13082,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX7-LABEL: flat_system_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: 
s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12379,6 +13102,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12398,6 +13125,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12433,6 +13164,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12450,6 +13183,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12576,6 +13311,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX7-LABEL: flat_system_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12594,6 +13332,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12615,6 +13357,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12653,6 +13399,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12671,6 +13419,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; 
GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12816,6 +13566,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX7-LABEL: flat_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12827,6 +13580,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12839,6 +13596,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_system_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12862,6 +13623,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12873,6 +13636,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12956,6 +13721,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX7-LABEL: flat_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12967,6 +13735,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12979,6 +13751,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13002,6 +13778,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13013,6 +13791,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13096,6 +13876,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX7-LABEL: flat_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13108,6 +13891,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], 
s[6:7], 0x8 @@ -13122,6 +13909,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX10-CU-LABEL: flat_system_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13148,6 +13939,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13161,6 +13954,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13264,6 +14059,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX7-LABEL: flat_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13276,6 +14074,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: 
; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13290,6 +14092,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13316,6 +14122,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13329,6 +14137,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13432,6 +14242,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 
s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13444,6 +14257,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13457,6 +14274,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -13482,6 +14303,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13494,6 +14317,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13586,6 +14411,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13600,6 +14428,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13616,6 +14448,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -13645,6 +14481,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13660,6 +14498,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13769,6 +14609,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX7-LABEL: flat_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13782,6 +14625,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13797,6 +14644,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -13825,6 +14676,8 @@ define amdgpu_kernel void 
@flat_system_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13839,6 +14692,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13951,6 +14806,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13966,6 +14824,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13984,6 +14846,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -14016,6 +14882,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -14033,6 +14901,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -14162,6 +15032,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14177,6 +15050,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -14195,6 +15072,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -14227,6 +15108,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -14244,6 +15127,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -14373,6 +15258,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 
0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14390,6 +15278,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14409,6 +15301,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14444,6 +15340,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14461,6 +15359,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword 
s6, s[6:7], 0x8 @@ -14585,6 +15485,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14603,6 +15506,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14624,6 +15531,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14662,6 +15573,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14681,6 +15594,8 @@ 
define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14829,6 +15744,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14847,6 +15765,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14868,6 +15790,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14906,6 +15832,8 @@ define amdgpu_kernel void 
@flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -14925,6 +15853,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15073,6 +16003,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15098,6 +16031,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15123,6 +16060,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; 
GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15173,6 +16114,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15188,6 +16131,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15306,6 +16251,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15333,6 +16281,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; 
GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15361,6 +16313,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15415,6 +16371,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15433,6 +16391,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15568,6 +16528,9 @@ 
entry: define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15594,6 +16557,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15621,6 +16588,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15674,6 +16645,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 @@ -15691,6 +16664,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15829,6 +16804,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15857,6 +16835,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15887,6 +16869,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15944,6 +16930,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15964,6 +16952,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16119,6 +17109,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16147,6 +17140,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16177,6 +17174,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16234,6 +17235,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16254,6 +17257,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16409,6 +17414,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: 
s_load_dword s7, s[4:5], 0x2 @@ -16436,6 +17444,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16464,6 +17476,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16518,6 +17534,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16536,6 +17554,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16671,6 +17691,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16698,6 +17721,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16726,6 +17753,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16780,6 +17811,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16798,6 +17831,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16933,6 +17968,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16961,6 +17999,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16991,6 +18033,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17048,6 +18094,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17068,6 +18116,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17223,6 +18273,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17251,6 +18304,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; 
GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17281,6 +18338,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17338,6 +18399,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17358,6 +18421,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17513,6 +18578,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 
s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17541,6 +18609,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17571,6 +18643,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17628,6 +18704,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17648,6 +18726,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, 
s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17803,6 +18883,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17831,6 +18914,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17861,6 +18948,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17918,6 +19009,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17938,6 +19031,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18093,6 +19188,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18121,6 +19219,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18151,6 +19253,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18208,6 +19314,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18228,6 +19336,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18383,6 +19493,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18411,6 +19524,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18441,6 +19558,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18498,6 +19619,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18518,6 +19641,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18673,6 +19798,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: 
s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18701,6 +19829,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18731,6 +19863,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18788,6 +19924,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18808,6 +19946,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18963,6 +20103,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18991,6 +20134,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19021,6 +20168,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19078,6 +20229,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19098,6 +20251,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19253,6 +20408,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19281,6 +20439,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19309,6 +20471,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; 
GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19365,6 +20531,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19383,6 +20551,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19528,6 +20698,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19558,6 +20731,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19589,6 +20766,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19649,6 +20830,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19670,6 +20853,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ 
-19830,6 +21015,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19859,6 +21047,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19889,6 +21081,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19948,6 +21144,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19968,6 +21166,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20133,6 +21333,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20164,6 +21367,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20197,6 +21404,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20260,6 +21471,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20283,6 +21496,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20467,6 +21682,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20498,6 +21716,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20531,6 +21753,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20594,6 +21820,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20617,6 +21845,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20801,6 +22031,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, 
s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20831,6 +22064,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20862,6 +22099,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20922,6 +22163,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20943,6 +22186,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21107,6 +22352,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21137,6 +22385,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21168,6 +22420,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21228,6 +22484,8 @@ define amdgpu_kernel void 
@flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21249,6 +22507,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21409,6 +22669,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21440,6 +22703,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21473,6 +22740,10 @@ define 
amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21536,6 +22807,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21559,6 +22832,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21743,6 +23018,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21774,6 +23052,10 @@ define 
amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21807,6 +23089,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21870,6 +23156,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21893,6 +23181,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22077,6 +23367,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22108,6 +23401,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22141,6 +23438,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22204,6 +23505,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22227,6 +23530,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22411,6 +23716,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22442,6 +23750,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22475,6 +23787,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22538,6 +23854,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22561,6 +23879,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22745,6 +24065,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22776,6 +24099,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22809,6 +24136,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22872,6 +24203,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22895,6 +24228,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23075,6 +24410,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23106,6 +24444,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23139,6 +24481,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23202,6 +24548,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23225,6 +24573,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23409,6 +24759,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23440,6 +24793,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23473,6 +24830,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23536,6 +24897,8 @@ define amdgpu_kernel void 
@flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23559,6 +24922,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23743,6 +25108,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23774,6 +25142,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23807,6 +25179,10 @@ define 
amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23870,6 +25246,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23893,6 +25271,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index f1b465c1789da..5f788e2e41ac5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -11,6 +11,9 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -27,6 +30,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -44,6 +51,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-CU-LABEL: flat_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -154,6 +165,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-LABEL: flat_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 @@ -183,6 +197,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; 
GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 @@ -211,6 +229,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-CU-LABEL: flat_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 @@ -411,6 +433,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX7-LABEL: flat_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -427,6 +452,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -444,6 +473,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-CU-LABEL: flat_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -558,6 +591,9 @@ entry: define amdgpu_kernel 
void @flat_nontemporal_store_1( ; GFX7-LABEL: flat_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -587,6 +623,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -616,6 +656,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-CU-LABEL: flat_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -821,6 +865,9 @@ entry: define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX7-LABEL: flat_volatile_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -837,6 +884,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; 
GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -854,6 +905,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -960,6 +1015,9 @@ entry: define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX7-LABEL: flat_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -972,6 +1030,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -986,6 +1048,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX10-CU-LABEL: flat_volatile_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 23982f8a00cdb..dad713198cc89 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX7-LABEL: flat_wavefront_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX10-WGP-LABEL: flat_wavefront_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX10-CU-LABEL: flat_wavefront_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -77,6 +88,8 @@ define amdgpu_kernel void 
@flat_wavefront_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -202,6 +217,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX7-LABEL: flat_wavefront_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -217,6 +235,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -233,6 +255,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -264,6 +290,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -278,6 +306,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -389,6 +419,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX7-LABEL: flat_wavefront_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -404,6 +437,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: s_nop 0 @@ -420,6 +457,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -451,6 +492,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -465,6 +508,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -576,6 +621,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX7-LABEL: flat_wavefront_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -591,6 +639,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; 
GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -607,6 +659,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -638,6 +694,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -652,6 +710,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -763,6 +823,9 @@ entry: define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX7-LABEL: flat_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 
s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -774,6 +837,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX10-WGP-LABEL: flat_wavefront_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -786,6 +853,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX10-CU-LABEL: flat_wavefront_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -809,6 +880,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -820,6 +893,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -903,6 +978,9 @@ entry: define amdgpu_kernel void 
@flat_wavefront_monotonic_store( ; GFX7-LABEL: flat_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -914,6 +992,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -926,6 +1008,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -949,6 +1035,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -960,6 +1048,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1043,6 +1133,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_store( ; GFX7-LABEL: flat_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1054,6 +1147,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX10-WGP-LABEL: flat_wavefront_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1066,6 +1163,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX10-CU-LABEL: flat_wavefront_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1089,6 +1190,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword 
s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1100,6 +1203,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1183,6 +1288,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX7-LABEL: flat_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1194,6 +1302,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1206,6 +1318,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1229,6 +1345,8 @@ define amdgpu_kernel void 
@flat_wavefront_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1240,6 +1358,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1323,6 +1443,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1335,6 +1458,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1348,6 +1475,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; 
GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1373,6 +1504,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1385,6 +1518,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1477,6 +1612,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX7-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1489,6 +1627,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; 
GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1502,6 +1644,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1527,6 +1673,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1539,6 +1687,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1631,6 +1781,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX7-LABEL: flat_wavefront_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1643,6 +1796,10 @@ define amdgpu_kernel void 
@flat_wavefront_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1656,6 +1813,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1681,6 +1842,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1693,6 +1856,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1785,6 +1950,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; 
%entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1797,6 +1965,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1810,6 +1982,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1835,6 +2011,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1847,6 +2025,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1939,6 +2119,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1951,6 +2134,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1964,6 +2151,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1989,6 +2180,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 
0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2001,6 +2194,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2093,6 +2288,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2108,6 +2306,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2124,6 +2326,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2155,6 +2361,8 @@ define amdgpu_kernel void 
@flat_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2169,6 +2377,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2278,6 +2488,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2293,6 +2506,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2309,6 +2526,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2340,6 +2561,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2354,6 +2577,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2463,6 +2688,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2478,6 +2706,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2494,6 +2726,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2525,6 +2761,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2539,6 +2777,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2648,6 +2888,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2673,6 +2916,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2698,6 +2945,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2748,6 +2999,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2763,6 +3016,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2881,6 +3136,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2906,6 +3164,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2931,6 +3193,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2981,6 +3247,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2996,6 +3264,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3114,6 +3384,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3139,6 +3412,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3164,6 +3441,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] 
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3214,6 +3495,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3229,6 +3512,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3347,6 +3632,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3372,6 +3660,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3397,6 +3689,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3447,6 +3743,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3462,6 +3760,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3580,6 +3880,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3605,6 
+3908,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3630,6 +3937,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3680,6 +3991,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3695,6 +4008,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 @@ -3813,6 +4128,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3838,6 +4156,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3863,6 +4185,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3913,6 +4239,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3928,6 +4256,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4046,6 +4376,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4071,6 +4404,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4096,6 +4433,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: 
s_load_dword s7, s[4:5], 0x8 @@ -4146,6 +4487,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4161,6 +4504,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4279,6 +4624,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4304,6 +4652,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4329,6 +4681,10 @@ define 
amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4379,6 +4735,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4394,6 +4752,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4512,6 +4872,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4537,6 +4900,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; 
GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4562,6 +4929,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4612,6 +4983,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4627,6 +5000,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4745,6 +5120,9 @@ entry: define amdgpu_kernel void 
@flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4770,6 +5148,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4795,6 +5177,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4845,6 +5231,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4860,6 +5248,8 @@ define amdgpu_kernel void 
@flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4978,6 +5368,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5003,6 +5396,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5028,6 +5425,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5078,6 +5479,8 @@ define amdgpu_kernel void 
@flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5093,6 +5496,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5211,6 +5616,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5236,6 +5644,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5261,6 +5673,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5311,6 +5727,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5326,6 +5744,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5444,6 +5864,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5469,6 +5892,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; 
GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5494,6 +5921,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5544,6 +5975,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5559,6 +5992,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5677,6 +6112,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; 
%bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5702,6 +6140,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5727,6 +6169,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5777,6 +6223,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5792,6 +6240,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5910,6 +6360,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5935,6 +6388,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5960,6 +6417,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6010,6 +6471,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6025,6 +6488,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6143,6 +6608,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6171,6 +6639,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6199,6 +6671,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: 
s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6255,6 +6731,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6273,6 +6751,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6418,6 +6898,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6446,6 +6929,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 
s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6474,6 +6961,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6530,6 +7021,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6548,6 +7041,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6693,6 +7188,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: 
flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6721,6 +7219,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6749,6 +7251,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6805,6 +7311,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6823,6 +7331,8 @@ define amdgpu_kernel void 
@flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6968,6 +7478,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6996,6 +7509,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7024,6 +7541,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7080,6 +7601,8 
@@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7098,6 +7621,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7243,6 +7768,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7271,6 +7799,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7299,6 +7831,10 @@ define 
amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7355,6 +7891,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7373,6 +7911,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7518,6 +8058,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7546,6 +8089,10 @@ define amdgpu_kernel void 
@flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7574,6 +8121,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7630,6 +8181,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7648,6 +8201,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 @@ -7793,6 +8348,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7821,6 +8379,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7849,6 +8411,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7905,6 +8471,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7923,6 +8491,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8068,6 +8638,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8096,6 +8669,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8124,6 +8701,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; 
GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8180,6 +8761,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8198,6 +8781,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8343,6 +8928,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8371,6 +8959,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, 
s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8399,6 +8991,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8455,6 +9051,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8473,6 +9071,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8618,6 +9218,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: 
s_load_dword s8, s[6:7], 0x3 @@ -8646,6 +9249,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8674,6 +9281,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8730,6 +9341,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8748,6 +9361,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8893,6 +9508,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8921,6 +9539,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8949,6 +9571,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9005,6 +9631,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9023,6 +9651,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9168,6 +9798,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9196,6 +9829,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9224,6 +9861,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9280,6 +9921,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9298,6 +9941,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9443,6 +10088,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9471,6 +10119,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9499,6 +10151,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9555,6 +10211,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9573,6 +10231,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9718,6 +10378,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9746,6 +10409,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9774,6 +10441,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9830,6 +10501,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9848,6 +10521,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, 
s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9993,6 +10668,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10021,6 +10699,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10049,6 +10731,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10105,6 +10791,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10123,6 +10811,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10268,6 +10958,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX7-LABEL: flat_wavefront_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10283,6 +10976,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10299,6 +10996,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -10330,6 +11031,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -10344,6 +11047,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -10455,6 +11160,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10470,6 +11178,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10486,6 +11198,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -10517,6 +11233,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -10531,6 +11249,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -10642,6 +11362,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX7-LABEL: flat_wavefront_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10657,6 +11380,10 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10673,6 +11400,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -10704,6 +11435,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -10718,6 +11451,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -10829,6 +11564,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX7-LABEL: 
flat_wavefront_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10844,6 +11582,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10860,6 +11602,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -10891,6 +11637,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -10905,6 +11653,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11016,6 +11766,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX7-LABEL: flat_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11027,6 +11780,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11039,6 +11796,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11062,6 +11823,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11073,6 +11836,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11156,6 +11921,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11167,6 +11935,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11179,6 +11951,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; 
GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11202,6 +11978,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11213,6 +11991,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11296,6 +12076,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX7-LABEL: flat_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11307,6 +12090,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11319,6 
+12106,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11342,6 +12133,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11353,6 +12146,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11436,6 +12231,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11447,6 +12245,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX10-WGP: ; 
%bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11459,6 +12261,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11482,6 +12288,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11493,6 +12301,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11576,6 +12386,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, 
s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11588,6 +12401,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11601,6 +12418,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11626,6 +12447,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11638,6 +12461,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; 
GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11730,6 +12555,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11742,6 +12570,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11755,6 +12587,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11780,6 +12616,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11792,6 +12630,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11884,6 +12724,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11896,6 +12739,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11909,6 +12756,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11934,6 +12785,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11946,6 +12799,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12038,6 +12893,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12050,6 +12908,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12063,6 +12925,10 @@ define 
amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12088,6 +12954,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12100,6 +12968,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12192,6 +13062,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12204,6 +13077,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; 
GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12217,6 +13094,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12242,6 +13123,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12254,6 +13137,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12346,6 +13231,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12361,6 +13249,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12377,6 +13269,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12408,6 +13304,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12422,6 +13320,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12531,6 +13431,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12546,6 +13449,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12562,6 +13469,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12593,6 +13504,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12607,6 +13520,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12716,6 +13631,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12731,6 +13649,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12747,6 +13669,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12778,6 +13704,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12792,6 +13720,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12901,6 +13831,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12926,6 +13859,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12951,6 +13888,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13001,6 +13942,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13016,6 +13959,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13134,6 +14079,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: 
s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13159,6 +14107,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13184,6 +14136,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13234,6 +14190,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13249,6 +14207,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13367,6 +14327,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13392,6 +14355,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13417,6 +14384,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13467,6 +14438,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13482,6 +14455,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13600,6 +14575,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13625,6 +14603,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13650,6 +14632,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; 
GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13700,6 +14686,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13715,6 +14703,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13833,6 +14823,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13858,6 +14851,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; 
GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13883,6 +14880,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13933,6 +14934,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13948,6 +14951,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ 
-14066,6 +15071,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14091,6 +15099,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14116,6 +15128,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14166,6 +15182,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14181,6 +15199,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14299,6 +15319,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14324,6 +15347,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14349,6 +15376,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], 
s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14399,6 +15430,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14414,6 +15447,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14532,6 +15567,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14557,6 +15595,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14582,6 +15624,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14632,6 +15678,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14647,6 +15695,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14765,6 +15815,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14790,6 +15843,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14815,6 +15872,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14865,6 +15926,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14880,6 +15943,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14998,6 +16063,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15023,6 +16091,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15048,6 +16120,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15098,6 +16174,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, 
s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15113,6 +16191,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15231,6 +16311,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15256,6 +16339,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15281,6 +16368,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: 
s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15331,6 +16422,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15346,6 +16439,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15464,6 +16559,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15489,6 +16587,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: 
s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15514,6 +16616,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15564,6 +16670,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15579,6 +16687,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15697,6 +16807,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; 
%bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15722,6 +16835,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15747,6 +16864,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15797,6 +16918,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15812,6 +16935,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15930,6 +17055,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15955,6 +17083,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15980,6 +17112,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16030,6 +17166,8 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16045,6 +17183,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16163,6 +17303,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16188,6 +17331,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16213,6 +17360,10 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16263,6 +17414,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16278,6 +17431,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16396,6 +17551,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16424,6 +17582,10 @@ define amdgpu_kernel 
void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16452,6 +17614,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16508,6 +17674,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16526,6 +17694,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16671,6 +17841,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16699,6 +17872,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16727,6 +17904,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16783,6 +17964,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16801,6 +17984,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16946,6 +18131,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16974,6 +18162,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17002,6 +18194,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, 
s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17058,6 +18254,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17076,6 +18274,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17221,6 +18421,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17249,6 +18452,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: 
; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17277,6 +18484,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17333,6 +18544,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17351,6 +18564,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17496,6 +18711,9 @@ entry: define amdgpu_kernel void 
@flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17524,6 +18742,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17552,6 +18774,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17608,6 +18834,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17626,6 +18854,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17771,6 +19001,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17799,6 +19032,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17827,6 +19064,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; 
GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17883,6 +19124,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17901,6 +19144,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18046,6 +19291,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18074,6 +19322,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18102,6 +19354,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18158,6 +19414,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18176,6 +19434,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18321,6 +19581,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: 
s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18349,6 +19612,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18377,6 +19644,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18433,6 +19704,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18451,6 +19724,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18596,6 +19871,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18624,6 +19902,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18652,6 +19934,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18708,6 +19994,8 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18726,6 +20014,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18871,6 +20161,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18899,6 +20192,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ 
-18927,6 +20224,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18983,6 +20284,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19001,6 +20304,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19146,6 +20451,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; 
GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19174,6 +20482,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19202,6 +20514,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19258,6 +20574,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19276,6 +20594,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, 
s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19421,6 +20741,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19449,6 +20772,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19477,6 +20804,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19533,6 +20864,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19551,6 +20884,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19696,6 +21031,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19724,6 +21062,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19752,6 +21094,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19808,6 +21154,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19826,6 +21174,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19971,6 +21321,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19999,6 +21352,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; 
GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20027,6 +21384,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20083,6 +21444,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20101,6 +21464,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 diff --git 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 5ddabad7374dd..a350394bcafe5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX7-LABEL: flat_workgroup_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX10-WGP-LABEL: flat_workgroup_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX10-CU-LABEL: flat_workgroup_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -202,6 +217,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX7-LABEL: flat_workgroup_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -217,6 +235,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -233,6 +255,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -264,6 +290,8 @@ define 
amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -278,6 +306,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -389,6 +419,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX7-LABEL: flat_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -405,6 +438,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -422,6 +459,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -455,6 +496,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -470,6 +513,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -588,6 +633,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX7-LABEL: flat_workgroup_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -605,6 +653,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: s_nop 0 @@ -624,6 +676,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -659,6 +715,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -675,6 +733,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -806,6 +866,9 @@ entry: define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX7-LABEL: flat_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -817,6 +880,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX10-WGP-LABEL: flat_workgroup_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; 
GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -829,6 +896,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX10-CU-LABEL: flat_workgroup_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -852,6 +923,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -863,6 +936,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -946,6 +1021,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX7-LABEL: flat_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 
0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -957,6 +1035,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -969,6 +1051,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -992,6 +1078,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1003,6 +1091,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1086,6 +1176,9 @@ entry: 
define amdgpu_kernel void @flat_workgroup_release_store( ; GFX7-LABEL: flat_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1098,6 +1191,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX10-WGP-LABEL: flat_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1112,6 +1209,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX10-CU-LABEL: flat_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1137,6 +1238,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1149,6 +1252,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1244,6 +1349,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX7-LABEL: flat_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1256,6 +1364,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1270,6 +1382,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1295,6 +1411,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword 
s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1307,6 +1425,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1402,6 +1522,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1414,6 +1537,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1427,6 +1554,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1452,6 +1583,8 @@ 
define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1464,6 +1597,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1556,6 +1691,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX7-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1569,6 +1707,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1585,6 +1727,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 
s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1612,6 +1758,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1625,6 +1773,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1729,6 +1879,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX7-LABEL: flat_workgroup_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1742,6 +1895,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; 
GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1757,6 +1914,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1784,6 +1945,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1797,6 +1960,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1901,6 +2066,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1915,6 +2083,10 @@ define amdgpu_kernel void 
@flat_workgroup_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1933,6 +2105,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1962,6 +2138,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1976,6 +2154,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2092,6 +2272,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; 
%entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2106,6 +2289,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -2124,6 +2311,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2153,6 +2344,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2167,6 +2360,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2283,6 +2478,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2299,6 +2497,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2316,6 +2518,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2349,6 +2555,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2364,6 +2572,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2480,6 +2690,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2497,6 +2710,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2516,6 +2733,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2551,6 
+2772,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2567,6 +2790,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2697,6 +2922,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2714,6 +2942,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2733,6 +2965,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; 
GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2768,6 +3004,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2784,6 +3022,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2914,6 +3154,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2939,6 +3182,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2964,6 +3211,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3014,6 +3265,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3029,6 +3282,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3147,6 +3402,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 
+; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3173,6 +3431,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3201,6 +3463,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3253,6 +3519,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3269,6 +3537,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3399,6 +3669,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3425,6 +3698,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3452,6 +3729,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3504,6 +3785,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3520,6 +3803,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3650,6 +3935,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3677,6 +3965,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3707,6 +3999,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3761,6 +4057,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3778,6 +4076,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3920,6 +4220,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3947,6 +4250,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3977,6 +4284,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4031,6 +4342,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4048,6 +4361,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4190,6 +4505,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4216,6 +4534,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4244,6 +4566,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4296,6 +4622,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4312,6 +4640,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4442,6 +4772,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4468,6 +4801,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4496,6 +4833,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4548,6 +4889,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4564,6 +4907,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4694,6 +5039,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4721,6 +5069,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4751,6 +5103,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4805,6 +5161,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4822,6 +5180,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4964,6 +5324,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4991,6 +5354,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] 
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5021,6 +5388,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5075,6 +5446,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5092,6 +5465,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5234,6 +5609,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: 
s_load_dword s7, s[4:5], 0x2 @@ -5261,6 +5639,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5291,6 +5673,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5345,6 +5731,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5362,6 +5750,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5504,6 +5894,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5531,6 +5924,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5561,6 +5958,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5615,6 +6016,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5632,6 +6035,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5774,6 +6179,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5802,6 +6210,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -5830,6 +6242,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; 
GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -5886,6 +6302,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5904,6 +6322,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6049,6 +6469,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6078,6 +6501,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; 
GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6107,6 +6534,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6165,6 +6596,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6184,6 +6617,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6336,6 +6771,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: 
s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6365,6 +6803,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6395,6 +6837,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6453,6 +6899,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6472,6 +6920,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6629,6 +7079,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6659,6 +7112,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6690,6 +7147,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6750,6 +7211,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6770,6 +7233,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6936,6 +7401,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6966,6 +7434,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6997,6 +7469,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7057,6 +7533,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7077,6 +7555,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7243,6 +7723,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7272,6 +7755,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7301,6 +7788,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7359,6 +7850,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7378,6 +7871,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7532,6 +8027,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7561,6 +8059,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7590,6 +8092,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7648,6 +8154,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7667,6 +8175,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7819,6 +8329,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7849,6 +8362,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7880,6 +8397,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7940,6 +8461,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7960,6 +8483,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8126,6 +8651,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8156,6 +8684,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8187,6 +8719,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8247,6 +8783,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8267,6 +8805,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8433,6 +8973,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8463,6 +9006,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8494,6 +9041,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8554,6 +9105,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8574,6 +9127,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8740,6 +9295,9 @@ entry: define amdgpu_kernel void 
@flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8770,6 +9328,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8801,6 +9363,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8861,6 +9427,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ 
-8881,6 +9449,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9047,6 +9617,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9077,6 +9650,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9108,6 +9685,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, 
s[6:7], 0xc @@ -9168,6 +9749,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9188,6 +9771,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9352,6 +9937,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9382,6 +9970,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9413,6 
+10005,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9473,6 +10069,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9493,6 +10091,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9659,6 +10259,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9689,6 +10292,10 @@ define amdgpu_kernel 
void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9720,6 +10327,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9780,6 +10391,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9800,6 +10413,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 @@ -9966,6 +10581,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9996,6 +10614,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10027,6 +10649,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10087,6 +10713,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10107,6 +10735,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10273,6 +10903,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX7-LABEL: flat_workgroup_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10288,6 +10921,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10304,6 +10941,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 
0 @@ -10335,6 +10976,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -10349,6 +10992,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -10460,6 +11105,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10475,6 +11123,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10491,6 +11143,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX10-CU-LABEL: 
flat_workgroup_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -10522,6 +11178,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -10536,6 +11194,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -10647,6 +11307,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX7-LABEL: flat_workgroup_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10662,6 +11325,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 
0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10680,6 +11347,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -10711,6 +11382,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -10725,6 +11398,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -10842,6 +11517,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10857,6 +11535,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10877,6 +11559,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -10908,6 +11594,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -10922,6 +11610,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_nop 0 @@ -11049,6 +11739,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX7-LABEL: flat_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11060,6 +11753,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11072,6 +11769,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11095,6 +11796,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11106,6 +11809,8 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11189,6 +11894,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11200,6 +11908,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11212,6 +11924,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11235,6 +11951,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_workgroup_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11246,6 +11964,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11329,6 +12049,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX7-LABEL: flat_workgroup_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11340,6 +12063,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11354,6 +12081,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, 
s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11377,6 +12108,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11388,6 +12121,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11480,6 +12215,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11491,6 +12229,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; 
GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11505,6 +12247,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11528,6 +12274,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11539,6 +12287,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11631,6 +12381,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ 
-11643,6 +12396,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11656,6 +12413,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11681,6 +12442,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11693,6 +12456,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11785,6 +12550,9 @@ entry: 
define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11797,6 +12565,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11812,6 +12584,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11837,6 +12613,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11849,6 +12627,8 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11949,6 +12729,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11961,6 +12744,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11976,6 +12763,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12001,6 +12792,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_workgroup_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12013,6 +12806,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12114,6 +12909,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12126,6 +12924,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12143,6 +12945,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; 
GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12168,6 +12974,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12180,6 +12988,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12289,6 +13099,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12301,6 +13114,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12318,6 +13135,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12343,6 +13164,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12355,6 +13178,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12464,6 +13289,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, 
s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12479,6 +13307,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12497,6 +13329,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12528,6 +13364,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12542,6 +13380,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12657,6 +13497,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12672,6 +13515,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12692,6 +13539,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12723,6 +13574,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12737,6 +13590,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12863,6 +13718,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12878,6 +13736,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12898,6 +13760,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword 
s6, s[6:7], 0x8 @@ -12929,6 +13795,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -12943,6 +13811,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13069,6 +13939,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13094,6 +13967,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13119,6 
+13996,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13169,6 +14050,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13184,6 +14067,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13302,6 +14187,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 
@@ -13327,6 +14215,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13354,6 +14246,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13404,6 +14300,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13419,6 +14317,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13545,6 +14445,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13570,6 +14473,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13597,6 +14504,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13647,6 +14558,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13662,6 +14575,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13789,6 +14704,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13814,6 +14732,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13843,6 +14765,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13893,6 +14819,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13908,6 +14836,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14043,6 +14973,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14068,6 +15001,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14097,6 +15034,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14147,6 +15088,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14162,6 +15105,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14297,6 +15242,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; 
%entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14322,6 +15270,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14349,6 +15301,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14399,6 +15355,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14414,6 +15372,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; 
GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14540,6 +15500,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14565,6 +15528,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14592,6 +15559,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14642,6 +15613,8 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14657,6 +15630,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14783,6 +15758,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14808,6 +15786,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14837,6 +15819,10 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14887,6 +15873,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14902,6 +15890,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15037,6 +16027,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15062,6 +16055,10 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15091,6 +16088,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15141,6 +16142,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15156,6 +16159,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15291,6 +16296,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15316,6 +16324,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15345,6 +16357,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15395,6 +16411,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15410,6 +16428,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15545,6 +16565,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15570,6 +16593,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15599,6 +16626,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15649,6 +16680,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15664,6 +16697,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15799,6 +16834,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15824,6 +16862,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15853,6 +16895,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15903,6 +16949,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15918,6 +16966,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16053,6 +17103,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16078,6 +17131,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16107,6 +17164,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16157,6 +17218,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16172,6 +17235,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16307,6 +17372,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16332,6 +17400,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16361,6 +17433,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16411,6 +17487,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16426,6 +17504,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16561,6 +17641,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16586,6 +17669,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16615,6 +17702,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16665,6 +17756,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16680,6 +17773,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16815,6 +17910,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16843,6 +17941,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX10-WGP: ; 
%bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16871,6 +17973,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16927,6 +18033,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16945,6 +18053,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17090,6 +18200,9 @@ entry: define 
amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17118,6 +18231,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17148,6 +18265,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17204,6 +18325,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17222,6 +18345,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17373,6 +18498,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17401,6 +18529,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17431,6 +18563,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17487,6 +18623,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17505,6 +18643,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17659,6 +18799,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17687,6 +18830,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17719,6 +18866,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17775,6 +18926,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17793,6 +18946,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17955,6 +19110,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: 
; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17983,6 +19141,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18015,6 +19177,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18071,6 +19237,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18089,6 +19257,8 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18251,6 +19421,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18279,6 +19452,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18309,6 +19486,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; 
GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18365,6 +19546,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18383,6 +19566,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18536,6 +19721,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18564,6 +19752,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; 
GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18594,6 +19786,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18650,6 +19846,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18668,6 +19866,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18819,6 +20019,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18847,6 +20050,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18879,6 +20086,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18935,6 +20146,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18953,6 +20166,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19115,6 +20330,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19143,6 +20361,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19175,6 +20397,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19231,6 +20457,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19249,6 +20477,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19411,6 +20641,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19439,6 +20672,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19471,6 +20708,10 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19527,6 +20768,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19545,6 +20788,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19707,6 +20952,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19735,6 +20983,10 
@@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19767,6 +21019,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19823,6 +21079,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19841,6 +21099,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 
s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20003,6 +21263,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20031,6 +21294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20063,6 +21330,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20119,6 +21390,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20137,6 +21410,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20297,6 +21572,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20325,6 +21603,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20357,6 +21639,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; 
GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20413,6 +21699,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20431,6 +21719,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20593,6 +21883,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20621,6 +21914,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: 
s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20653,6 +21950,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20709,6 +22010,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20727,6 +22030,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20889,6 +22194,9 @@ entry: define amdgpu_kernel void 
@flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20917,6 +22225,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 +; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20949,6 +22261,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 +; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21005,6 +22321,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21023,6 +22341,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 8a02ad5dfdb7b..8d14f92d9806e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -40,6 +40,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX7-LABEL: global_agent_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -238,6 +241,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX7-LABEL: global_agent_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -437,6 +443,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX7-LABEL: global_agent_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -652,6 +661,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX7-LABEL: global_agent_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -876,6 +888,9 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX7-LABEL: global_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1045,6 +1060,9 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX7-LABEL: global_agent_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1215,6 +1233,9 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX7-LABEL: global_agent_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1411,6 +1432,9 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX7-LABEL: global_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: 
s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1606,6 +1630,9 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX7-LABEL: global_agent_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1766,6 +1793,9 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX7-LABEL: global_agent_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1952,6 +1982,9 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX7-LABEL: global_agent_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2139,6 +2172,9 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_agent_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2353,6 +2389,9 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_agent_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 
s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2567,6 +2606,9 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2770,6 +2812,9 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3003,6 +3048,9 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3236,6 +3284,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3464,6 +3515,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: 
global_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3718,6 +3772,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3973,6 +4030,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4255,6 +4315,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4536,6 +4599,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4791,6 +4857,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5047,6 +5116,9 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5329,6 +5401,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5611,6 +5686,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5893,6 +5971,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: 
s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6175,6 +6256,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6457,6 +6541,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6739,6 +6826,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -7021,6 +7111,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -7303,6 +7396,9 @@ define amdgpu_kernel void 
@global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7561,6 +7657,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7835,6 +7934,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8120,6 +8222,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8425,6 +8530,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8729,6 +8837,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9007,6 +9118,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9282,6 +9396,9 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9587,6 +9704,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9892,6 +10012,9 @@ define amdgpu_kernel void 
@global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10197,6 +10320,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10502,6 +10628,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10803,6 +10932,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11108,6 +11240,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11413,6 +11548,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11716,6 +11854,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX7-LABEL: global_agent_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11914,6 +12055,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX7-LABEL: global_agent_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12113,6 +12257,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX7-LABEL: global_agent_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12328,6 +12475,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX7-LABEL: 
global_agent_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12552,6 +12702,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX7-LABEL: global_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12721,6 +12874,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX7-LABEL: global_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12891,6 +13047,9 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX7-LABEL: global_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13087,6 +13246,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) @@ -13282,6 +13444,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13442,6 +13607,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13628,6 +13796,9 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13815,6 +13986,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14029,6 +14203,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14243,6 +14420,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14446,6 +14626,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14679,6 +14862,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14912,6 +15098,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15140,6 +15329,9 @@ define amdgpu_kernel void 
@global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15394,6 +15586,9 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15649,6 +15844,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15931,6 +16129,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16212,6 +16413,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, 
s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16467,6 +16671,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16723,6 +16930,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17005,6 +17215,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17287,6 +17500,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17569,6 +17785,9 @@ define 
amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17851,6 +18070,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18133,6 +18355,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18415,6 +18640,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18697,6 +18925,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 
s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18979,6 +19210,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19237,6 +19471,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19512,6 +19749,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19817,6 +20057,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; 
GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20121,6 +20364,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20399,6 +20645,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20674,6 +20923,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20979,6 +21231,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21284,6 +21539,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: 
global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21589,6 +21847,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21894,6 +22155,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22195,6 +22459,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22500,6 +22767,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22805,6 +23075,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 14f1734235673..1069cb6f0135d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -35,6 +35,9 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX7-LABEL: global_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -242,6 +245,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX7-LABEL: global_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 @@ -506,6 +512,9 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX7-LABEL: global_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -708,6 +717,9 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX7-LABEL: global_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -961,6 +973,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX7-LABEL: global_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 33aaeebf658dd..bf4d77ad61c6b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -40,6 +40,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX7-LABEL: global_singlethread_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -238,6 +241,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX7-LABEL: global_singlethread_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -436,6 +442,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX7-LABEL: global_singlethread_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -634,6 +643,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX7-LABEL: global_singlethread_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -825,6 +837,9 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX7-LABEL: global_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -994,6 +1009,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX7-LABEL: global_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1163,6 +1181,9 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX7-LABEL: global_singlethread_release_store: 
; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1332,6 +1353,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX7-LABEL: global_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1501,6 +1525,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX7-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1659,6 +1686,9 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1817,6 +1847,9 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX7-LABEL: global_singlethread_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ 
-1975,6 +2008,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2133,6 +2169,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2293,6 +2332,9 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2479,6 +2521,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2665,6 +2710,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2853,6 +2901,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3079,6 +3130,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3305,6 +3359,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3531,6 +3588,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3757,6 +3817,9 @@ define amdgpu_kernel void 
@global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3983,6 +4046,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4209,6 +4275,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4435,6 +4504,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4661,6 +4733,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4887,6 +4962,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5113,6 +5191,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5339,6 +5420,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5565,6 +5649,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5791,6 +5878,9 @@ define amdgpu_kernel void 
@global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6017,6 +6107,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6246,6 +6339,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6503,6 +6599,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6760,6 +6859,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7017,6 +7119,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7274,6 +7379,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7531,6 +7639,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7788,6 +7899,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; 
GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8045,6 +8159,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8302,6 +8419,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8559,6 +8679,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8816,6 +8939,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9073,6 +9199,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: 
global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9330,6 +9459,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9587,6 +9719,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9844,6 +9979,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10101,6 +10239,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX7-LABEL: global_singlethread_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10299,6 +10440,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10497,6 +10641,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10695,6 +10842,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10886,6 +11036,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX7-LABEL: global_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11055,6 +11208,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_monotonic_store( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11224,6 +11380,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX7-LABEL: global_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11393,6 +11552,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11562,6 +11724,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11720,6 +11885,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11878,6 +12046,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12036,6 +12207,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12194,6 +12368,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12354,6 +12531,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12540,6 +12720,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12726,6 +12909,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12914,6 +13100,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13140,6 +13329,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13366,6 +13558,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13592,6 +13787,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13818,6 +14016,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14044,6 +14245,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14270,6 +14474,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14496,6 +14703,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14722,6 +14932,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14948,6 +15161,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15174,6 +15390,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15400,6 +15619,9 @@ define amdgpu_kernel void 
@global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15626,6 +15848,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15852,6 +16077,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16078,6 +16306,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16307,6 +16538,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16564,6 +16798,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16821,6 +17058,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17078,6 +17318,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17335,6 +17578,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17592,6 +17838,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17849,6 +18098,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18106,6 +18358,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18363,6 +18618,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 
@@ -18620,6 +18878,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18877,6 +19138,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19134,6 +19398,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19391,6 +19658,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19648,6 +19918,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX7-LABEL: 
global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19905,6 +20178,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index ae5ec082024fd..ecef93dcd84d9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -40,6 +40,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX7-LABEL: global_system_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -238,6 +241,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX7-LABEL: global_system_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -437,6 +443,9 @@ define amdgpu_kernel void 
@global_system_acquire_load( ; ; GFX7-LABEL: global_system_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -654,6 +663,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX7-LABEL: global_system_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -880,6 +892,9 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX7-LABEL: global_system_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1049,6 +1064,9 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX7-LABEL: global_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1219,6 +1237,9 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX7-LABEL: global_system_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ 
-1417,6 +1438,9 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX7-LABEL: global_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1614,6 +1638,9 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX7-LABEL: global_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1774,6 +1801,9 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX7-LABEL: global_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1962,6 +1992,9 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX7-LABEL: global_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2151,6 +2184,9 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2369,6 +2405,9 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2587,6 +2626,9 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2792,6 +2834,9 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3029,6 +3074,9 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3266,6 +3314,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: 
s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3494,6 +3545,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3750,6 +3804,9 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4007,6 +4064,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4293,6 +4353,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4578,6 +4641,9 @@ define amdgpu_kernel void 
@global_system_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4835,6 +4901,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5093,6 +5162,9 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5379,6 +5451,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5665,6 +5740,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 
s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5951,6 +6029,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6237,6 +6318,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6495,6 +6579,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6772,6 +6859,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7081,6 +7171,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: 
global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7389,6 +7482,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7669,6 +7765,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7946,6 +8045,9 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8255,6 +8357,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 
s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8564,6 +8669,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8873,6 +8981,9 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9182,6 +9293,9 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9487,6 +9601,9 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9796,6 +9913,9 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: 
global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10105,6 +10225,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10412,6 +10535,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX7-LABEL: global_system_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10610,6 +10736,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX7-LABEL: global_system_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10809,6 +10938,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX7-LABEL: global_system_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11026,6 +11158,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX7-LABEL: global_system_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11252,6 +11387,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX7-LABEL: global_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11421,6 +11559,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX7-LABEL: global_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11591,6 +11732,9 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX7-LABEL: global_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11789,6 +11933,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX7-LABEL: global_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11986,6 +12133,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12146,6 +12296,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12334,6 +12487,9 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12523,6 +12679,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12741,6 +12900,9 @@ define amdgpu_kernel void 
@global_system_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12959,6 +13121,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13164,6 +13329,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13401,6 +13569,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13638,6 +13809,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13866,6 +14040,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14122,6 +14299,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14379,6 +14559,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14665,6 +14848,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14950,6 +15136,9 @@ define amdgpu_kernel void 
@global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15207,6 +15396,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15465,6 +15657,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15751,6 +15946,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16037,6 +16235,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, 
s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16323,6 +16524,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16609,6 +16813,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16895,6 +17102,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17181,6 +17391,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17467,6 +17680,9 @@ define 
amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17753,6 +17969,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18011,6 +18230,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18287,6 +18509,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18574,6 +18799,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry 
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18883,6 +19111,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19191,6 +19422,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19471,6 +19705,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19748,6 +19985,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 
s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20057,6 +20297,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20366,6 +20609,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20675,6 +20921,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20984,6 +21233,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21289,6 +21541,9 @@ define amdgpu_kernel void 
@global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21598,6 +21853,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21907,6 +22165,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index d916ff533e77b..eea71794e549e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -36,6 +36,9 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX7-LABEL: global_volatile_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -192,6 +195,9 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX7-LABEL: global_volatile_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 @@ -389,6 +395,9 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX7-LABEL: global_volatile_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -554,6 +563,9 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX7-LABEL: global_volatile_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -757,6 +769,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX7-LABEL: global_volatile_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -900,6 +915,9 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX7-LABEL: global_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, 
s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index aaa11c0455606..95bc4ddd0cff7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -40,6 +40,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX7-LABEL: global_wavefront_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -238,6 +241,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX7-LABEL: global_wavefront_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -436,6 +442,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX7-LABEL: global_wavefront_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -634,6 +643,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX7-LABEL: global_wavefront_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -825,6 +837,9 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX7-LABEL: global_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -994,6 +1009,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX7-LABEL: global_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1163,6 +1181,9 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX7-LABEL: global_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1332,6 +1353,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX7-LABEL: global_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1501,6 +1525,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX7-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1659,6 +1686,9 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1817,6 +1847,9 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX7-LABEL: global_wavefront_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1975,6 +2008,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2133,6 +2169,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2293,6 +2332,9 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX7-LABEL: 
global_wavefront_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2479,6 +2521,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2665,6 +2710,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2853,6 +2901,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3079,6 +3130,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 
s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3305,6 +3359,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3531,6 +3588,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3757,6 +3817,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3983,6 +4046,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4209,6 +4275,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4435,6 +4504,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4661,6 +4733,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4887,6 +4962,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5113,6 +5191,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 
0x2 @@ -5339,6 +5420,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5565,6 +5649,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5791,6 +5878,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6017,6 +6107,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6246,6 +6339,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: 
s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6503,6 +6599,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6760,6 +6859,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7017,6 +7119,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7274,6 +7379,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, 
s[6:7], 0x3 @@ -7531,6 +7639,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7788,6 +7899,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8045,6 +8159,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8302,6 +8419,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8559,6 +8679,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8816,6 +8939,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9073,6 +9199,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9330,6 +9459,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9587,6 +9719,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, 
s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9844,6 +9979,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10101,6 +10239,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX7-LABEL: global_wavefront_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10299,6 +10440,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10497,6 +10641,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10695,6 +10842,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10886,6 +11036,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX7-LABEL: global_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11055,6 +11208,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11224,6 +11380,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX7-LABEL: global_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11393,6 +11552,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11562,6 +11724,9 @@ define 
amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11720,6 +11885,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11878,6 +12046,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12036,6 +12207,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12194,6 +12368,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12354,6 +12531,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12540,6 +12720,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12726,6 +12909,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12914,6 +13100,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13140,6 +13329,9 @@ define 
amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13366,6 +13558,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13592,6 +13787,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13818,6 +14016,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14044,6 +14245,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14270,6 +14474,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14496,6 +14703,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14722,6 +14932,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14948,6 +15161,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15174,6 +15390,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15400,6 +15619,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15626,6 +15848,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15852,6 +16077,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16078,6 +16306,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: 
global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16307,6 +16538,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16564,6 +16798,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16821,6 +17058,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17078,6 +17318,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: 
s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17335,6 +17578,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17592,6 +17838,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17849,6 +18098,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18106,6 +18358,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: 
s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18363,6 +18618,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18620,6 +18878,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18877,6 +19138,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19134,6 +19398,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19391,6 +19658,9 @@ define amdgpu_kernel void 
@global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19648,6 +19918,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19905,6 +20178,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 26511f079fa8f..28c6f255e86a8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -40,6 +40,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX7-LABEL: global_workgroup_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -238,6 +241,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX7-LABEL: global_workgroup_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -436,6 +442,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX7-LABEL: global_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -640,6 +649,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX7-LABEL: global_workgroup_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -847,6 +859,9 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX7-LABEL: global_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1016,6 +1031,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX7-LABEL: global_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1186,6 +1204,9 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX7-LABEL: global_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1374,6 +1395,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX7-LABEL: global_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1561,6 +1585,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX7-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1719,6 +1746,9 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1888,6 +1918,9 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX7-LABEL: global_workgroup_release_atomicrmw: ; GFX7: ; %bb.0: ; 
%entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2065,6 +2098,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2252,6 +2288,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2440,6 +2479,9 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2632,6 +2674,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2844,6 +2889,9 @@ define 
amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3057,6 +3105,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3283,6 +3334,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3520,6 +3574,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3765,6 +3822,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4020,6 +4080,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4274,6 +4337,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4510,6 +4576,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4747,6 +4816,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5002,6 +5074,9 @@ define amdgpu_kernel void 
@global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5257,6 +5332,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5512,6 +5590,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5767,6 +5848,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6022,6 +6106,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, 
s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6277,6 +6364,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6532,6 +6622,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6789,6 +6882,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7046,6 +7142,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7309,6 +7408,9 @@ define amdgpu_kernel void 
@global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7585,6 +7687,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7868,6 +7973,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8150,6 +8258,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8414,6 +8525,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: 
s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8677,6 +8791,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8960,6 +9077,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9243,6 +9363,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9526,6 +9649,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ 
-9809,6 +9935,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10090,6 +10219,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10373,6 +10505,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10656,6 +10791,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10938,6 +11076,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX7-LABEL: global_workgroup_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11136,6 +11277,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11334,6 +11478,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11537,6 +11684,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11741,6 +11891,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX7-LABEL: global_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11910,6 
+12063,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12079,6 +12235,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX7-LABEL: global_workgroup_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12259,6 +12418,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12439,6 +12601,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12597,6 +12762,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12765,6 +12933,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12934,6 +13105,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13113,6 +13287,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13294,6 +13471,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13485,6 +13665,9 @@ define amdgpu_kernel void 
@global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13689,6 +13872,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13895,6 +14081,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14121,6 +14310,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14357,6 +14549,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14594,6 +14789,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14841,6 +15039,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15088,6 +15289,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15324,6 +15528,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: 
s_load_dword s7, s[4:5], 0x2 @@ -15560,6 +15767,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15807,6 +16017,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16054,6 +16267,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16301,6 +16517,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16548,6 +16767,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: 
global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16795,6 +17017,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17042,6 +17267,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17289,6 +17517,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17539,6 +17770,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17796,6 +18030,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18058,6 +18295,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18326,6 +18566,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18601,6 +18844,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: 
s_load_dword s8, s[6:7], 0x3 @@ -18876,6 +19122,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19140,6 +19389,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19402,6 +19654,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19677,6 +19932,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19952,6 +20210,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; 
GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20227,6 +20488,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20502,6 +20766,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20775,6 +21042,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21050,6 +21320,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21325,6 +21598,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index fce60ff12aed3..24598bcbea5bb 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -37,6 +37,9 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX7-LABEL: local_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -240,6 +243,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX7-LABEL: local_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -895,6 +901,9 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX7-LABEL: local_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 
+; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index a8f7051bd5050..455834572a59a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -33,6 +33,9 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX7-LABEL: local_volatile_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -180,6 +183,9 @@ define amdgpu_kernel void @local_volatile_load_1( ; ; GFX7-LABEL: local_volatile_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index c3599c87985be..55c40ee491100 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -37,7 +37,10 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX7-LABEL: private_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; 
GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -53,7 +56,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX10-WGP-LABEL: private_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -69,7 +72,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX10-CU-LABEL: private_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -110,7 +113,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -126,7 +129,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -248,7 +251,10 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX7-LABEL: private_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-NEXT: s_add_u32 s0, 
s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -266,7 +272,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX10-WGP-LABEL: private_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 @@ -284,7 +290,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX10-CU-LABEL: private_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 @@ -329,7 +335,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 @@ -350,7 +356,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 @@ -507,7 +513,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX7-LABEL: private_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -522,7 +528,7 @@ 
define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX10-WGP-LABEL: private_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -538,7 +544,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX10-CU-LABEL: private_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -573,7 +579,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -589,7 +595,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -705,7 +711,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX7-LABEL: private_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -722,7 +728,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX10-WGP-LABEL: 
private_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -738,7 +744,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX10-CU-LABEL: private_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -775,7 +781,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -794,7 +800,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -945,7 +951,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX7-LABEL: private_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -961,7 +970,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX10-WGP-LABEL: 
private_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -977,7 +986,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX10-CU-LABEL: private_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1018,7 +1027,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1034,7 +1043,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 9146f175eefcd..cdb1b463ac321 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -37,7 +37,10 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX7-LABEL: private_volatile_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -53,7 +56,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX10-WGP-LABEL: private_volatile_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -69,7 +72,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX10-CU-LABEL: private_volatile_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -198,7 +201,10 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX7-LABEL: private_volatile_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -216,7 +222,7 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX10-WGP-LABEL: private_volatile_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 @@ -234,7 +240,7 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX10-CU-LABEL: private_volatile_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; 
GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 @@ -386,7 +392,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX7-LABEL: private_volatile_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -402,7 +408,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX10-WGP-LABEL: private_volatile_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -419,7 +425,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX10-CU-LABEL: private_volatile_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -549,7 +555,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX7-LABEL: private_volatile_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -567,7 +573,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX10-WGP-LABEL: private_volatile_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -584,7 +590,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX10-CU-LABEL: 
private_volatile_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll index d070dc3b770f8..1fd311f225db4 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll @@ -203,13 +203,13 @@ attributes #5 = { "amdgpu-flat-work-group-size"="128,512" } attributes #6 = { "amdgpu-flat-work-group-size"="512,512" } attributes #7 = { "amdgpu-flat-work-group-size"="64,256" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" 
"amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" 
"amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index f62f1d57aec8e..9577f2a932f96 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -399,26 +399,26 @@ attributes #17 = { "amdgpu-waves-per-eu"="5,8" } attributes #18 = { "amdgpu-waves-per-eu"="9,10" } attributes #19 = { "amdgpu-waves-per-eu"="8,9" } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" 
"uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } -; 
CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" } +; CHECK: attributes 
#[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll index c1d647c5d3b90..9758afa3b4991 100644 --- a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll @@ -19,5 +19,5 @@ define void @hoge() { ret void } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll index 4420833029d46..8792e60bb0ca1 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll @@ -191,11 +191,11 @@ define amdgpu_kernel void @kernel_lds_recursion() { !1 = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" 
"uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll index 59036c64c8afc..a51b0128a3a4a 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 { ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 s0, s0, s13 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_load_dword s4, s[6:7], 0x2 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index cca7b49996ff3..183cdb2e1f862 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -80,8 +80,8 @@ define amdgpu_kernel void @test_simple_indirect_call() { ;. ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" } ;. 
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. ; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll index 049db01badacf..2ab48479fa1b0 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll @@ -31,6 +31,6 @@ define amdgpu_kernel void @kernel1() #1 { attributes #0 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll index c9387f196dff9..cc58d34a8b255 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll @@ -98,7 +98,7 @@ define amdgpu_kernel void @kernel2() #0 { attributes #0 = { "uniform-work-group-size"="true" } ;. 
; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR2]] = { "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" 
"amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll index 7183da2c5efc3..33298bde89b97 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll @@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel3() #2 { attributes #2 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" 
"amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll index 6ed04cf63d20b..6fe85cd7c9e65 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll @@ -41,7 +41,7 @@ define amdgpu_kernel void @kernel2() #2 { attributes #1 = { "uniform-work-group-size"="true" } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll index d5ba2fd617c6e..f6ab402e01232 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll @@ -52,8 +52,8 @@ attributes #0 = { nounwind } attributes #1 = { "uniform-work-group-size"="false" } attributes #2 = { "uniform-work-group-size"="true" } ;. 
-; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll index 7f0dfeaf75c80..37e3376ef9d50 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll @@ -101,7 +101,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %m) #1 { attributes #0 = { nounwind readnone } attributes #1 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR2]] = { 
"amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" 
"amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll index 8616c73ad51c1..7f83686bc756e 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll @@ -61,6 +61,6 @@ define amdgpu_kernel void @kernel3() #0 { attributes #0 = { "uniform-work-group-size"="false" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll index a827ebe96cfcf..43a948b6c6ab2 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-LABEL: __omp_offloading_16_dd2df_main_l9: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 s0, s0, s13 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 From 7eacaf22e1de95f1c45214c228b78ebe7ad11827 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Fri, 16 Aug 2024 10:43:34 -0700 Subject: [PATCH 02/13] (1) Use getCalledFunction instead of getCalledOperand (2) other minor code change based on reviews (3) fix test files. 
--- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 22 +- .../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 368 ++++++++++++-- .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 390 ++++++++++++-- .../AMDGPU/GlobalISel/extractelement.ll | 69 +-- ...licit-kernarg-backend-usage-global-isel.ll | 50 +- .../GlobalISel/insertelement-stack-lower.ll | 2 +- .../AMDGPU/GlobalISel/lds-global-value.ll | 5 +- .../GlobalISel/llvm.amdgcn.if.break.i64.ll | 3 + .../GlobalISel/llvm.amdgcn.trig.preop.ll | 24 + .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 33 ++ .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 33 ++ .../abi-attribute-hints-undefined-behavior.ll | 18 +- llvm/test/CodeGen/AMDGPU/addrspacecast.ll | 41 +- llvm/test/CodeGen/AMDGPU/always-uniform.ll | 3 + ...amdgpu-codegenprepare-fold-binop-select.ll | 10 +- .../CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll | 4 +- .../annotate-kernel-features-hsa-call.ll | 2 +- .../AMDGPU/annotate-kernel-features-hsa.ll | 30 +- .../attr-amdgpu-flat-work-group-size.ll | 4 +- .../CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll | 6 +- .../AMDGPU/attr-amdgpu-waves-per-eu.ll | 4 +- .../attributor-flatscratchinit-globalisel.ll | 2 +- .../AMDGPU/attributor-flatscratchinit.ll | 257 +++++----- .../AMDGPU/call-graph-register-usage.ll | 8 +- llvm/test/CodeGen/AMDGPU/code-object-v3.ll | 12 +- .../CodeGen/AMDGPU/combine-reg-or-const.ll | 3 + ...dagcomb-extract-vec-elt-different-sizes.ll | 2 + ...cannot-create-empty-or-backward-segment.ll | 2 +- .../expand-scalar-carry-out-select-user.ll | 3 + llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 66 +++ .../fast-unaligned-load-store.global.ll | 20 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 236 ++++++++- .../flat-for-global-subtarget-feature.ll | 7 +- llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll | 105 ++-- .../AMDGPU/fmul-2-combine-multi-use.ll | 48 ++ llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 60 +++ .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 3 + llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 62 ++- llvm/test/CodeGen/AMDGPU/half.ll | 231 
+++++++++ .../AMDGPU/hsa-metadata-kernel-code-props.ll | 9 +- llvm/test/CodeGen/AMDGPU/hsa.ll | 4 +- .../AMDGPU/implicit-kernarg-backend-usage.ll | 50 +- .../indirect-call-set-from-other-function.ll | 2 +- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll | 36 +- .../AMDGPU/insert_vector_elt.v2bf16.ll | 58 ++- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 214 +++++++- llvm/test/CodeGen/AMDGPU/ipra.ll | 2 +- llvm/test/CodeGen/AMDGPU/kernarg-size.ll | 2 +- llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 248 ++++++--- .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 37 +- .../AMDGPU/llvm.amdgcn.readfirstlane.ll | 70 ++- .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 114 ++++- .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 126 ++++- llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 6 + llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 127 ++++- llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 85 +++- llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 18 + llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 164 +++++- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 129 ++++- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 83 ++- llvm/test/CodeGen/AMDGPU/load-select-ptr.ll | 3 +- .../AMDGPU/lower-module-lds-via-hybrid.ll | 16 +- .../AMDGPU/lower-module-lds-via-table.ll | 15 +- .../CodeGen/AMDGPU/mad24-get-global-id.ll | 2 +- .../match-perm-extract-vector-elt-bug.ll | 8 +- llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 20 +- llvm/test/CodeGen/AMDGPU/min.ll | 211 +++++++- llvm/test/CodeGen/AMDGPU/pack.v2f16.ll | 21 + llvm/test/CodeGen/AMDGPU/pack.v2i16.ll | 18 + ...al-regcopy-and-spill-missed-at-regalloc.ll | 61 ++- .../AMDGPU/propagate-flat-work-group-size.ll | 2 +- .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 4 +- .../AMDGPU/remove-no-kernel-id-attribute.ll | 8 +- llvm/test/CodeGen/AMDGPU/sad.ll | 68 ++- .../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll | 16 + .../scc-clobbered-sgpr-to-vmem-spill.ll | 474 +++++++++--------- llvm/test/CodeGen/AMDGPU/shift-i128.ll | 24 +- 
.../CodeGen/AMDGPU/simple-indirect-call-2.ll | 10 +- llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 70 ++- .../CodeGen/AMDGPU/spill-vector-superclass.ll | 8 +- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 6 + llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll | 2 +- llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll | 2 +- llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll | 9 +- llvm/test/CodeGen/AMDGPU/trap-abis.ll | 16 +- llvm/test/CodeGen/AMDGPU/udiv.ll | 45 ++ llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 91 +++- ...ine-function-info-long-branch-reg-debug.ll | 7 +- .../machine-function-info-long-branch-reg.ll | 7 +- 89 files changed, 4176 insertions(+), 900 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 76eba1aa9ffcd..3ef0694046553 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -440,17 +440,11 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { return; } - bool HasAllocaOrASCast = false; - for (BasicBlock &BB : *F) { - for (Instruction &I : BB) { - if (isa(I) || isa(I)) { - HasAllocaOrASCast = true; - removeAssumedBits(FLAT_SCRATCH_INIT); - break; - } + for (Instruction &I : instructions(F)) { + if (isa(I) || isa(I)) { + removeAssumedBits(FLAT_SCRATCH_INIT); + return; } - if (HasAllocaOrASCast) - break; } } @@ -707,13 +701,12 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { // no-flat-scratch-init. 
auto CheckForNoFlatScratchInit = [&](Instruction &I) { const auto &CB = cast(I); - const Value *CalleeOp = CB.getCalledOperand(); - const Function *Callee = dyn_cast(CalleeOp); + const Function *Callee = CB.getCalledFunction(); if (!Callee) // indirect call return CB.isInlineAsm(); if (Callee->isIntrinsic()) - return true; + return Callee->getIntrinsicID() != Intrinsic::amdgcn_addrspacecast_nonnull; const auto *CalleeInfo = A.getAAFor( *this, IRPosition::function(*Callee), DepClassTy::REQUIRED); @@ -722,7 +715,8 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { bool UsedAssumedInformation = false; // If any callee is false (i.e. need FlatScratchInit), - // checkForAllCallLikeInstructions returns false + // checkForAllCallLikeInstructions returns false, in which case this + // function returns true. return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this, UsedAssumedInformation); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index 705bcbddf227a..cb64c25b5f080 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -20,11 +20,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -35,11 +38,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; 
VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -99,11 +105,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -114,11 +123,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -291,6 +303,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_dec_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: v_mov_b32_e32 v0, s2 @@ -306,6 +321,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_dec_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -365,6 +383,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -382,6 +403,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -444,6 +468,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; CI-LABEL: global_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -461,6 +488,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; VI-LABEL: global_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -523,6 +553,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_dec_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -535,6 +568,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_dec_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -585,6 +621,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -599,6 +638,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -652,6 +694,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr 
addrspa ; CI-LABEL: global_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -666,6 +711,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; VI-LABEL: global_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -720,7 +768,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -728,6 +778,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -742,7 +793,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -750,6 +803,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -814,6 +868,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -831,6 +888,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -890,6 +950,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -905,6 +968,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -920,6 +986,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -934,6 +1002,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -970,6 +1042,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -987,6 +1062,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -1004,6 +1082,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: 
flat_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1018,6 +1098,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1057,6 +1141,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -1074,6 +1161,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -1091,6 +1181,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1105,6 +1197,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1144,6 +1240,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1156,6 +1255,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1168,6 +1270,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1179,6 +1283,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; 
GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1211,6 +1319,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1225,6 +1336,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1239,6 +1353,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1250,6 +1366,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) @@ -1285,6 +1405,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1299,6 +1422,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1313,6 +1439,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1324,6 +1452,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1360,7 +1492,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1368,6 +1502,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -1382,7 +1517,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1390,6 +1527,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1404,6 +1542,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1422,6 +1562,10 @@ define amdgpu_kernel void 
@flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 @@ -1478,6 +1622,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1495,6 +1642,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1512,6 +1662,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1525,6 +1677,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1571,10 +1727,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1592,10 +1751,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1613,7 +1775,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_ret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -1628,6 +1792,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: 
flat_atomic_dec_ret_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1666,12 +1834,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1689,12 +1860,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1712,7 +1886,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 
; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -1727,6 +1903,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1768,10 +1948,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1781,10 +1964,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1794,7 +1980,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i64: ; GFX9: 
; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -1806,6 +1994,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1840,12 +2032,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1855,12 +2050,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: 
v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1870,7 +2068,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -1882,6 +2082,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1919,12 +2123,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1934,12 +2141,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1949,7 +2159,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -1961,6 +2173,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1999,6 +2215,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2025,6 +2244,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: 
s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2051,12 +2273,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2070,6 +2294,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2128,6 +2356,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2146,6 +2377,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2164,12 +2398,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2178,6 +2414,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2231,8 +2471,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 
v0, s0 @@ -2249,8 +2492,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2326,7 +2572,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -2342,7 +2591,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2410,7 +2662,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -2426,7 +2681,10 @@ define amdgpu_kernel void 
@lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2612,10 +2870,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_dec_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2628,10 +2889,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_dec_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2691,12 +2955,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 
v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2709,12 +2976,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2775,12 +3045,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; CI-LABEL: global_atomic_dec_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2793,12 +3066,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; VI-LABEL: global_atomic_dec_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 
; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2859,10 +3135,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_dec_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2872,10 +3151,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_dec_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2926,12 +3208,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2941,12 +3226,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2998,12 +3286,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; CI-LABEL: global_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3013,12 +3304,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; VI-LABEL: global_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: 
s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3071,6 +3365,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3094,6 +3391,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3170,6 +3470,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -3188,6 +3491,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 
@@ -3258,7 +3564,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: flat_store_dword v[3:4], v0 @@ -3277,7 +3586,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: flat_store_dword v[3:4], v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index b3a7e65f771c4..00ff2d7a35d56 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -20,11 +20,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -35,11 +38,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -99,11 +105,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -114,11 +123,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -291,6 +303,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_inc_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 
8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -306,6 +321,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_inc_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -365,6 +383,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -382,6 +403,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -444,6 +468,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -461,6 +488,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; VI: ; %bb.0: ; VI-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -523,6 +553,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_inc_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -535,6 +568,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_inc_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -585,6 +621,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -599,6 +638,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -652,6 +694,9 @@ define 
amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; CI-LABEL: global_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -666,6 +711,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; VI-LABEL: global_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -720,7 +768,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -728,6 +778,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -742,7 +793,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -750,6 +803,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -814,6 +868,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -831,6 +888,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -896,8 +956,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -914,8 +977,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -991,7 +1057,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1007,7 +1076,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1075,7 +1147,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1091,7 +1166,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1277,10 +1355,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_inc_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1293,10 +1374,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_inc_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1356,12 +1440,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 
0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1374,12 +1461,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1440,12 +1530,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; CI-LABEL: global_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1458,12 +1551,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; VI-LABEL: global_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1524,10 +1620,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_inc_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1537,10 +1636,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_inc_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1591,12 +1693,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: 
v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1606,12 +1711,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1663,12 +1771,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; CI-LABEL: global_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1678,12 +1789,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; VI-LABEL: global_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: 
v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1736,6 +1850,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1759,6 +1876,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1835,6 +1955,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1853,6 +1976,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1916,6 +2042,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32: ; 
CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -1931,6 +2060,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1946,6 +2078,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1960,6 +2094,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1996,6 +2134,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, 
s2, 16 @@ -2013,6 +2154,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2030,6 +2174,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2044,6 +2190,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2083,6 +2233,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2100,6 +2253,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; VI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; 
VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2117,6 +2273,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2131,6 +2289,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2170,6 +2332,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2182,6 +2347,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2194,6 +2362,8 @@ define amdgpu_kernel 
void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2205,6 +2375,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2237,6 +2411,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2251,6 +2428,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2265,6 +2445,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2276,6 +2458,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2311,6 +2497,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2325,6 +2514,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2339,6 +2531,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2350,6 +2544,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: 
flat_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2386,7 +2584,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2394,6 +2594,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -2408,7 +2609,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2416,6 +2619,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; 
VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -2430,6 +2634,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2448,6 +2654,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 @@ -2504,6 +2714,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2521,6 +2734,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2538,6 +2754,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2551,6 +2769,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2604,7 +2826,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: flat_store_dword v[3:4], v0 @@ -2623,7 +2848,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: flat_store_dword v[3:4], v0 @@ -2696,10 +2924,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: 
v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2717,10 +2948,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2738,7 +2972,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_ret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -2753,6 +2989,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -2791,12 +3031,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: 
flat_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2814,12 +3057,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2837,7 +3083,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -2852,6 +3100,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -2893,12 +3145,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2916,12 +3171,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2939,7 +3197,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -2954,6 +3214,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -2995,10 +3259,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3008,10 +3275,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3021,7 +3291,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; 
GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -3033,6 +3305,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3067,12 +3343,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3082,12 +3361,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-3097,7 +3379,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -3109,6 +3393,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3146,12 +3434,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3161,12 +3452,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 
flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3176,7 +3470,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -3188,6 +3484,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3226,6 +3526,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3252,6 +3555,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 
8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3278,12 +3584,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3297,6 +3605,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3355,6 +3667,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -3373,6 +3688,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: 
s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -3391,12 +3709,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3405,6 +3725,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3455,6 +3779,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s4 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 @@ -3462,6 +3787,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -3475,6 +3802,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 @@ -3482,6 +3810,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 34efb089b72bf..868b530e42a21 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3037,7 +3037,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 10 +; GPRIDX-NEXT: user_sgpr_count = 12 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3052,7 +3052,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 -; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 +; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; 
GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -3069,7 +3069,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 13 +; GPRIDX-NEXT: wavefront_sgpr_count = 15 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -3117,7 +3117,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -3128,7 +3128,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 10 +; MOVREL-NEXT: user_sgpr_count = 12 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3143,7 +3143,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 -; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 +; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -3160,7 +3160,7 @@ define amdgpu_kernel void 
@dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 9 +; MOVREL-NEXT: wavefront_sgpr_count = 22 ; MOVREL-NEXT: workitem_vgpr_count = 4 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -3178,21 +3178,24 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; MOVREL-NEXT: s_load_dword s8, s[6:7], 0x8 +; MOVREL-NEXT: s_add_i32 s10, s10, s15 +; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; MOVREL-NEXT: s_mov_b32 s4, 0 ; MOVREL-NEXT: s_mov_b32 s5, 0x40080000 -; MOVREL-NEXT: s_mov_b32 s2, 0 -; MOVREL-NEXT: s_mov_b32 s3, 0x40140000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 ; MOVREL-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 ; MOVREL-NEXT: s_cmp_eq_u32 s8, 2 +; MOVREL-NEXT: s_mov_b32 s2, 0 ; MOVREL-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; MOVREL-NEXT: s_cmp_eq_u32 s8, 3 +; MOVREL-NEXT: s_mov_b32 s3, 0x40140000 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] ; MOVREL-NEXT: s_cmp_eq_u32 s8, 4 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 +; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; MOVREL-NEXT: v_mov_b32_e32 v1, s3 ; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3220,7 +3223,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 10 +; GFX10-NEXT: user_sgpr_count = 12 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3235,7 +3238,7 @@ 
define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 -; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4054,7 +4057,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 10 +; GPRIDX-NEXT: user_sgpr_count = 12 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4069,7 +4072,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 -; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 +; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4086,7 +4089,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 12 +; GPRIDX-NEXT: wavefront_sgpr_count = 14 ; GPRIDX-NEXT: workitem_vgpr_count = 2 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4127,7 +4130,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: kernel_code_entry_byte_offset = 
256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 0 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -4138,7 +4141,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 10 +; MOVREL-NEXT: user_sgpr_count = 12 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4153,7 +4156,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 -; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 +; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4170,7 +4173,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 8 +; MOVREL-NEXT: wavefront_sgpr_count = 22 ; MOVREL-NEXT: workitem_vgpr_count = 3 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4188,6 +4191,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dword s2, s[6:7], 0x8 ; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; MOVREL-NEXT: s_add_i32 s10, s10, s15 +; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; 
MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s2, 1 ; MOVREL-NEXT: s_cselect_b32 s3, 2.0, 1.0 @@ -4223,7 +4229,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 10 +; GFX10-NEXT: user_sgpr_count = 12 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4238,7 +4244,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 -; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4401,7 +4407,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 10 +; GPRIDX-NEXT: user_sgpr_count = 12 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4416,7 +4422,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 -; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 +; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GPRIDX-NEXT: 
enable_sgpr_grid_workgroup_count_y = 0 @@ -4433,7 +4439,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 13 +; GPRIDX-NEXT: wavefront_sgpr_count = 15 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4477,7 +4483,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -4488,7 +4494,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 10 +; MOVREL-NEXT: user_sgpr_count = 12 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4503,7 +4509,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 -; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 +; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4520,7 +4526,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: 
gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 9 +; MOVREL-NEXT: wavefront_sgpr_count = 22 ; MOVREL-NEXT: workitem_vgpr_count = 4 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4538,10 +4544,12 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dword s8, s[6:7], 0x8 ; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; MOVREL-NEXT: s_add_i32 s10, s10, s15 +; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; MOVREL-NEXT: s_mov_b32 s2, 0 -; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 +; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 ; MOVREL-NEXT: s_cmp_eq_u32 s8, 2 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] @@ -4549,6 +4557,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 +; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; MOVREL-NEXT: v_mov_b32_e32 v1, s3 ; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -4576,7 +4585,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 10 +; GFX10-NEXT: user_sgpr_count = 12 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4591,7 +4600,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: 
enable_sgpr_dispatch_id = 1 -; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 80e9ae33d6d45..70b889389ff99 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -10,11 +10,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 -; GFX8V4-NEXT: s_add_i32 s8, s8, s11 -; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 -; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s9 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 +; GFX8V4-NEXT: s_add_i32 s12, s12, s17 +; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_mov_b32 s4, s0 ; GFX8V4-NEXT: s_mov_b32 s5, s3 @@ -38,11 +38,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xc8 -; GFX8V5-NEXT: s_add_i32 s6, s6, s9 -; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0xc8 +; GFX8V5-NEXT: s_add_i32 s10, s10, s15 +; GFX8V5-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 +; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_mov_b32 s4, s0 ; GFX8V5-NEXT: s_mov_b32 s5, s2 @@ -65,9 +65,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) @@ -91,9 +91,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) @@ -127,6 +127,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 +; GFX8V4-NEXT: s_add_i32 s12, s12, s17 +; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 @@ -140,6 +143,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: 
s_load_dword s0, s[6:7], 0xcc +; GFX8V5-NEXT: s_add_i32 s10, s10, s15 +; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 @@ -183,6 +189,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 +; GFX8V4-NEXT: s_add_i32 s12, s12, s17 +; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 @@ -196,6 +205,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 +; GFX8V5-NEXT: s_add_i32 s10, s10, s15 +; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 @@ -279,7 +291,10 @@ define amdgpu_kernel void @llvm_debugtrap() { define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_add_i32 s12, s12, s17 ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s7 ; GFX8V4-NEXT: s_add_u32 s0, s8, 8 ; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc @@ -305,6 +320,9 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_add_i32 s10, s10, s15 +; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8V5-NEXT: 
s_add_u32 s0, s6, 8 ; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V5-NEXT: s_addc_u32 s1, s7, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index 4fcde0f2fc7cf..7aa3b5bb10990 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[20:23], s[6:7], 0x0 ; GCN-NEXT: s_load_dwordx2 s[24:25], s[6:7], 0x10 -; GCN-NEXT: s_add_u32 s0, s0, s13 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v64, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll index c3938e673a6da..6ed2df430998f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -11,13 +11,16 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace( ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 4 ; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_add_i32 s10, s10, s15 ; CHECK-NEXT: ds_read_b32 v2, v0 -; CHECK-NEXT: v_mov_b32_e32 v3, 9 +; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_add_u32 s0, s0, 4 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: v_mov_b32_e32 v3, 9 ; CHECK-NEXT: flat_store_dword v[0:1], v2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x200 ; CHECK-NEXT: ds_write_b32 v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll index 
d7a82b415ff06..3a90c3ee90803 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll @@ -6,6 +6,9 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s2, s[6:7], 0x0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xa +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cselect_b32 s2, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll index f3654fea486e0..96fd14f52d13b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll @@ -42,6 +42,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -59,6 +62,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -76,6 +82,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, 
s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -85,6 +93,10 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; ; GFX10-LABEL: s_trig_preop_f64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 @@ -113,6 +125,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; CI-LABEL: s_trig_preop_f64_imm: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; CI-NEXT: s_add_u32 s0, s0, 4 @@ -128,6 +143,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; VI-LABEL: s_trig_preop_f64_imm: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; VI-NEXT: s_add_u32 s0, s0, 4 @@ -143,6 +161,8 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; GFX9-LABEL: s_trig_preop_f64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -151,6 +171,10 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX10-LABEL: 
s_trig_preop_f64_imm: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s10, s10, s15 +; GFX10-NEXT: s_addc_u32 s11, s11, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 7d7f450e590fa..f0ec0d101f5be 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -7,6 +7,9 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s8, s5, 31 ; GFX8-NEXT: s_add_i32 s0, s5, s8 @@ -145,6 +148,9 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: sdivrem_i64: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 @@ -616,6 +622,9 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: sdivrem_v2i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s10, 31 @@ -845,6 +854,9 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1271,6 +1283,9 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: sdivrem_v2i64: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2188,6 +2203,9 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-LABEL: sdiv_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 @@ -2333,6 +2351,9 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x10 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 @@ -2597,6 
+2618,9 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-LABEL: sdiv_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 @@ -2742,6 +2766,9 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-LABEL: sdivrem_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x10 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i16 s0, s3 ; GFX8-NEXT: s_ashr_i32 s8, s0, 31 @@ -3003,6 +3030,9 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-LABEL: sdivrem_i3: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 @@ -3154,6 +3184,9 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_i27: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 5aef667934709..2be04ace99e36 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -7,6 +7,9 @@ define 
amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: udivrem_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX8-NEXT: s_sub_i32 s0, 0, s5 @@ -112,6 +115,9 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: udivrem_i64: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11 @@ -522,6 +528,9 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: udivrem_v2i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 @@ -685,6 +694,9 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -979,7 +991,10 @@ define amdgpu_kernel 
void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: udivrem_v2i64: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s10, s10, s15 ; GFX8-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 @@ -1772,6 +1787,9 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-LABEL: udiv_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 @@ -1885,6 +1903,9 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -2081,6 +2102,9 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-LABEL: udiv_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s5, s4, 16 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 @@ -2193,7 +2217,10 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-LABEL: udivrem_v2i16: ; GFX8: ; %bb.0: ; 
GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX8-NEXT: s_add_i32 s10, s10, s15 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s2, s1, 0xffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2387,6 +2414,9 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-LABEL: udivrem_i3: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 @@ -2505,6 +2535,9 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: udivrem_i27: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index e53653408feb4..b8ffa4f14c3e5 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -135,6 +135,9 @@ define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr) ; FIXEDABI-LABEL: marked_kernel_use_workitem_id: ; FIXEDABI: ; %bb.0: ; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 +; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0) ; FIXEDABI-NEXT: v_mov_b32_e32 v4, s1 ; 
FIXEDABI-NEXT: v_mov_b32_e32 v3, s0 @@ -181,16 +184,19 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr) ; FIXEDABI-LABEL: marked_kernel_use_workgroup_id: ; FIXEDABI: ; %bb.0: ; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; FIXEDABI-NEXT: v_mov_b32_e32 v2, s6 +; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 +; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8 ; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0) ; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0 ; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1 ; FIXEDABI-NEXT: flat_store_dword v[0:1], v2 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) -; FIXEDABI-NEXT: v_mov_b32_e32 v2, s7 +; FIXEDABI-NEXT: v_mov_b32_e32 v2, s9 ; FIXEDABI-NEXT: flat_store_dword v[0:1], v2 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) -; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8 +; FIXEDABI-NEXT: v_mov_b32_e32 v2, s10 ; FIXEDABI-NEXT: flat_store_dword v[0:1], v2 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) ; FIXEDABI-NEXT: s_endpgm @@ -238,6 +244,9 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 { define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 { ; FIXEDABI-LABEL: marked_kernel_use_other_sgpr: ; FIXEDABI: ; %bb.0: +; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 +; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; FIXEDABI-NEXT: s_add_u32 s0, s4, 8 ; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc ; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0 @@ -261,7 +270,10 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) # define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 { ; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr: ; FIXEDABI: ; %bb.0: +; FIXEDABI-NEXT: s_add_i32 s4, s4, s9 ; FIXEDABI-NEXT: v_mov_b32_e32 v0, 0 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s5 +; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; FIXEDABI-NEXT: v_mov_b32_e32 v1, 0 ; 
FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc ; FIXEDABI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 7336543b41cbc..9534561e6e280 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -63,23 +63,16 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 { ; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast: -; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}} - -; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 -; CI-DAG: s_cmp_lg_u32 [[PTR]], -1 -; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0 -; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 - -; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} -; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base - -; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 -; GFX9: s_cmp_lg_u32 [[PTR]], -1 -; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0 -; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0 - -; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}} +; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}} +; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]] +; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3] +; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1] +; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9 +; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9 +; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0 +; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen ; HSA: .amdhsa_user_sgpr_private_segment_buffer 1 ; HSA: .amdhsa_user_sgpr_dispatch_ptr 0 @@ -259,8 +252,11 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 { ; FIXME: Shouldn't need to enable queue ptr ; HSA-LABEL: 
{{^}}cast_0_private_to_flat_addrspacecast: -; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3] +; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1] +; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9 +; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7 +; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]] define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 { @@ -281,7 +277,12 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 { ; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast: -; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} +; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3] +; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1] +; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9 +; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7 +; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0 +; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]] diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll index 0a461f9ee6c96..e0c69706bad79 100644 --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -8,8 +8,10 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt ; GCN-LABEL: readfirstlane_uniform: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s0, s0, s4 @@ -18,6 +20,7 @@ define 
amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt ; GCN-NEXT: s_add_u32 s0, s2, 40 ; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index de318e7ae31a5..89c5303e0e81d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -393,11 +393,15 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) { ; GCN-LABEL: select_add_lhs_const_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: v_mov_b32_e32 v0, 0x83 +; GCN-NEXT: v_mov_b32_e32 v1, 0x80 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 -; GCN-NEXT: s_movk_i32 s0, 0x80 -; GCN-NEXT: s_cselect_b32 s0, s0, 0x83 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: flat_store_short v[0:1], v0 ; GCN-NEXT: s_endpgm %select = select i1 %cond, i16 5, i16 8 diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll index a6d8c6f41eee5..54a800ecee9f1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs -; TRAP-HANDLER-ENABLE: NumSgprs: 77 -; TRAP-HANDLER-DISABLE: NumSgprs: 92 +; TRAP-HANDLER-ENABLE: NumSgprs: 83 +; TRAP-HANDLER-DISABLE: 
NumSgprs: 98 define amdgpu_kernel void @amdhsa_trap_num_sgprs( ptr addrspace(1) %out0, i32 %in0, ptr addrspace(1) %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index 23294eacbe6cb..ab2e28e5f5cbb 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -751,7 +751,7 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func -; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @use_dispatch_ptr_ret_type() ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index e30e013e3e3d8..ac5458f56f08b 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -478,11 +478,16 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) #1 { ; No-op addrspacecast should not use queue ptr define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast -; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr -; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast +; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; AKF_HSA-NEXT: store volatile i32 0, ptr 
[[STOF]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast +; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR13]] { +; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(1) %ptr to ptr store volatile i32 0, ptr %stof @@ -490,11 +495,16 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt } define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast -; HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] { -; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr -; HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast +; AKF_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr +; AKF_HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast +; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR13]] { +; ATTRIBUTOR_HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(4) %ptr to ptr %ld = load volatile i32, ptr %stof diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll index fc13b86566f76..22cc5af30da66 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -35,9 +35,9 @@ entry: attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} ; CHECK-LABEL: {{^}}min_1024_max_1024 -; CHECK: SGPRBlocks: 0 +; 
CHECK: SGPRBlocks: 2 ; CHECK: VGPRBlocks: 10 -; CHECK: NumSGPRsForWavesPerEU: 2{{$}} +; CHECK: NumSGPRsForWavesPerEU: 24{{$}} ; CHECK: NumVGPRsForWavesPerEU: 43 @var = addrspace(1) global float 0.0 define amdgpu_kernel void @min_1024_max_1024() #3 { diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll index ed045107d354d..3ddf8be052e4a 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -4,8 +4,8 @@ ; ALL-LABEL: {{^}}max_10_sgprs: -; ALL: SGPRBlocks: 1 -; ALL: NumSGPRsForWavesPerEU: 10 +; ALL: SGPRBlocks: 2 +; ALL: NumSGPRsForWavesPerEU: 22 define amdgpu_kernel void @max_10_sgprs() #0 { %one = load volatile i32, ptr addrspace(4) undef %two = load volatile i32, ptr addrspace(4) undef @@ -125,7 +125,7 @@ declare i64 @llvm.amdgcn.dispatch.id() #1 declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1 declare ptr addrspace(4) @llvm.amdgcn.queue.ptr() #1 -attributes #0 = { nounwind "amdgpu-num-sgpr"="14" } +attributes #0 = { nounwind "amdgpu-num-sgpr"="18" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind "amdgpu-num-sgpr"="12" } attributes #3 = { nounwind "amdgpu-num-sgpr"="11" } diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 14519f5a5e77c..26dc3e13a72cf 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"} ; Exactly 10 waves per execution unit. 
; CHECK-LABEL: {{^}}exactly_10: -; CHECK: SGPRBlocks: 2 +; CHECK: SGPRBlocks: 3 ; CHECK: VGPRBlocks: 5 -; CHECK: NumSGPRsForWavesPerEU: 20 +; CHECK: NumSGPRsForWavesPerEU: 26 ; CHECK: NumVGPRsForWavesPerEU: 24 define amdgpu_kernel void @exactly_10() #9 { %val0 = load volatile float, ptr addrspace(1) @var diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll index ce5a3eedb5ebb..f9f48aacfc2bf 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll @@ -1,5 +1,5 @@ ; Test the generation of the attribute amdgpu-no-flat-scratch-init -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator | FileCheck -check-prefixes=GFX10 %s ;; tests of alloca diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll index c0d700cc37464..6d9da9281211c 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll @@ -1,6 +1,6 @@ ; Test the generation of the attribute amdgpu-no-flat-scratch-init -; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s ;; tests of alloca @@ -200,10 +200,10 @@ define amdgpu_kernel void @call_with_alloca_cc_kernel() { define void 
@call_both_with_and_without_alloca() { ; GFX9-LABEL: define void @call_both_with_and_without_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]] ; ; GFX10-LABEL: define void @call_both_with_and_without_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]] call void @with_alloca() call void @without_alloca() ret void @@ -211,10 +211,10 @@ define void @call_both_with_and_without_alloca() { define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]] call void @with_alloca() call void @without_alloca() ret void @@ -235,7 +235,7 @@ define amdgpu_kernel void @call_call_without_alloca_cc_kernel() { ; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] call void @call_without_alloca() ret void } @@ -262,10 +262,10 @@ define amdgpu_kernel void @call_call_with_alloca_cc_kernel() { define void @with_alloca_call_without_alloca() { ; GFX9-LABEL: define void @with_alloca_call_without_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]] ; ; GFX10-LABEL: define void @with_alloca_call_without_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @without_alloca() @@ -274,10 +274,10 @@ define void @with_alloca_call_without_alloca() { define amdgpu_kernel void 
@with_alloca_call_without_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @without_alloca() @@ -310,10 +310,10 @@ define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() { define void @with_alloca_call_call_without_alloca() { ; GFX9-LABEL: define void @with_alloca_call_call_without_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]] ; ; GFX10-LABEL: define void @with_alloca_call_call_without_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @call_without_alloca() @@ -322,10 +322,10 @@ define void @with_alloca_call_call_without_alloca() { define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @call_without_alloca() @@ -360,30 +360,30 @@ define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() { define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { ; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr 
addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] ; ; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] store volatile i32 0, ptr addrspace(1) %ptr ret void } define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] store volatile i32 0, ptr addrspace(1) %ptr ret void } define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { ; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] %stof = addrspacecast ptr addrspace(1) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -391,10 +391,10 @@ define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: 
#[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] %stof = addrspacecast ptr addrspace(1) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -402,30 +402,30 @@ define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrs define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { ; GFX9-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] ; ; GFX10-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] store volatile i32 0, ptr addrspace(2) %ptr ret void } define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] store volatile i32 0, ptr addrspace(2) %ptr ret void } define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { ; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] %stof = addrspacecast ptr addrspace(2) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -433,10 +433,10 @@ define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { ; GFX9-LABEL: define 
amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] %stof = addrspacecast ptr addrspace(2) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -444,30 +444,30 @@ define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrs define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] ; ; GFX10-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] store volatile i32 0, ptr addrspace(3) %ptr ret void } define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] store volatile i32 0, ptr addrspace(3) %ptr ret void } define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; 
GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] %stof = addrspacecast ptr addrspace(3) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -475,10 +475,10 @@ define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] %stof = addrspacecast ptr addrspace(3) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -486,20 +486,20 @@ define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrsp define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { ; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI5:[0-9]+]] store volatile i32 0, ptr addrspace(4) %ptr ret void } define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI6:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI6:[0-9]+]] store volatile i32 0, ptr addrspace(4) %ptr ret void } @@ -528,30 +528,30 @@ 
define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr add define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] ; ; GFX10-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] store volatile i32 0, ptr addrspace(5) %ptr ret void } define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] store volatile i32 0, ptr addrspace(5) %ptr ret void } define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -559,10 +559,10 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; 
GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -570,50 +570,50 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] ; ; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: 
#[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -621,10 +621,10 @@ define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrsp define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; 
GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -632,70 +632,78 @@ define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacec define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] ; ; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } + + + + + + + + define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] call void @call_with_private_to_flat_addrspacecast(ptr 
addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void 
@with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -704,10 +712,10 @@ define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace( define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -716,10 +724,10 @@ define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: 
#[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -728,10 +736,10 @@ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -764,10 +772,10 @@ define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kern define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]] ; ; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]] call void @without_alloca(i1 true) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -775,10 +783,10 @@ define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) { define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; 
GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]] call void @without_alloca(i1 true) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -834,10 +842,10 @@ declare i32 @llvm.amdgcn.workgroup.id.x() define void @use_intrinsic_workitem_id_x() { ; GFX9-LABEL: define void @use_intrinsic_workitem_id_x() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI7:[0-9]+]] ; ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI7:[0-9]+]] %val = call i32 @llvm.amdgcn.workitem.id.x() store volatile i32 %val, ptr addrspace(1) undef ret void @@ -856,59 +864,84 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() { define void @call_use_intrinsic_workitem_id_x() { ; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI7:[0-9]+]] ; ; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI7:[0-9]+]] call void @use_intrinsic_workitem_id_x() ret void } define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] call void @use_intrinsic_workitem_id_x() ret void } -; GFX9: attributes #[[ATTR_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_NOFSI]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -;GFX9: attributes #[[ATTR_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_NOFSI2]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" 
"uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI3]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" 
"target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI4]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_NOFSI3]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_NOFSI4]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes 
#[[ATTR_GFX9_NO_NOFSI5]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI6]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_NOFSI5]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; GFX9: attributes #[[ATTR_GFX9_NOFSI6]] = { nofree norecurse nounwind 
memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } ; GFX9: attributes #[[ATTR_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NOFSI3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_NOFSI7]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes 
#[[ATTR_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_NOFSI]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_NOFSI2]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: 
attributes #[[ATTR_GFX10_NO_NOFSI3]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI4]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_NOFSI3]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_NOFSI4]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI5]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; 
GFX10: attributes #[[ATTR_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI6]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_NOFSI5]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_NOFSI6]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_NOFSI7]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index 0d66ea55a0437..51fff3444324f 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -33,8 +33,8 @@ define void @indirect_use_vcc() #1 { } ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel: -; CI: ; NumSgprs: 36 -; VI-NOBUG: ; NumSgprs: 36 +; CI: ; NumSgprs: 38 +; VI-NOBUG: ; NumSgprs: 40 ; VI-BUG: ; NumSgprs: 96 ; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 { @@ -121,8 +121,8 @@ define void 
@indirect_use_80_sgpr() #1 { } ; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr: -; CI: ; NumSgprs: 82 -; VI-NOBUG: ; NumSgprs: 82 +; CI: ; NumSgprs: 84 +; VI-NOBUG: ; NumSgprs: 86 ; VI-BUG: ; NumSgprs: 96 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 { call void @indirect_use_80_sgpr() diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll index 3035a8579c8a6..48ad2fe687804 100644 --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -12,13 +12,13 @@ ; OSABI-AMDHSA-ASM: .section .rodata,"a" ; OSABI-AMDHSA-ASM: .p2align 6 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fadd -; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10 +; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 12 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 16 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 -; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 +; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 1 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel ; OSABI-AMDHSA-ASM: .text @@ -31,13 +31,13 @@ ; OSABI-AMDHSA-ASM: .section .rodata,"a" ; OSABI-AMDHSA-ASM: .p2align 6 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fsub -; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10 +; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 12 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 16 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 -; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 +; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 1 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel ; OSABI-AMDHSA-ASM: .text diff --git 
a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll index 5fbcd0bf66999..c97d333800602 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -5,6 +5,9 @@ define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 { ; CHECK-LABEL: _Z11test_kernelPii: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-NEXT: s_add_i32 s10, s10, s15 +; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-NEXT: s_load_dword s0, s[6:7], 0x2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 3 diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll index 297fe7618672e..f586f6d16e0ef 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll @@ -6,6 +6,8 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) { ; CHECK-LABEL: eggs: ; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 ; CHECK-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 85ed2914b8c7f..39328d706ba26 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: 
s_load_dword s2, s[6:7], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-NEXT: s_load_dword s14, s[6:7], 0x4 -; CHECK-NEXT: s_add_u32 s24, s24, s13 +; CHECK-NEXT: s_add_u32 s24, s24, s15 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index 54fb1dc5c0527..05a245cd3443c 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -94,6 +94,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-LABEL: s_add_co_br_user: ; GFX7: ; %bb.0: ; %bb ; GFX7-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s0, s2, s2 ; GFX7-NEXT: s_cmp_lt_u32 s0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 21799ab79b839..7fde702bd49ac 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -13,6 +13,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -25,6 +28,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -68,6 +74,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -80,6 +89,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -122,6 +134,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -134,6 +149,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -175,6 +193,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-LABEL: s_fabs_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: 
s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -188,6 +209,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; VI-LABEL: s_fabs_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -232,6 +256,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; CI-LABEL: fabs_fold_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -249,6 +276,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -295,6 +325,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -309,6 +342,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) 
%out, ptr addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -358,6 +394,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -370,6 +409,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -415,6 +457,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 @@ -441,6 +486,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -499,9 +547,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_lshr_b32 s2, s4, 16 @@ -527,9 +578,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -589,6 +643,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -612,6 +669,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -682,6 +742,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -700,6 +763,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 7252c69cb1cf7..4df6b8d066915 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -74,6 +74,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-ALIGNED-NEXT: s_add_i32 s10, s10, s15 +; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -90,6 +93,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-UNALIGNED-NEXT: s_add_i32 s10, s10, s15 +; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -220,8 +226,10 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-ALIGNED-NEXT: s_add_i32 s10, s10, s15 +; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -231,6 +239,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 @@ -247,6 +256,9 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-UNALIGNED-NEXT: s_add_i32 s10, s10, s15 +; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -359,6 +371,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-LABEL: 
global_store_2xi16_align4: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-ALIGNED-NEXT: s_add_i32 s10, s10, s15 +; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -369,6 +384,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-UNALIGNED-NEXT: s_add_i32 s10, s10, s15 +; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index f0ce96af90649..6482749bd2fb7 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -24,6 +24,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; GFX678-LABEL: v_test_canonicalize_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -80,6 +83,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s2, s[6:7], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX6-NEXT: s_add_i32 s10, s10, s15 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) 
; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -91,6 +97,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -140,6 +149,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -196,6 +208,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -253,6 +268,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -309,6 +327,9 @@ define amdgpu_kernel void 
@test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; GFX678-LABEL: test_fold_canonicalize_undef_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -352,6 +373,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -395,6 +419,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -441,6 +468,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -485,6 +515,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, -1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -529,6 +562,9 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; GFX678-LABEL: test_fold_canonicalize_literal_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -573,6 +609,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -616,10 +655,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -664,10 +706,13 @@ define amdgpu_kernel void 
@test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -712,10 +757,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -760,6 +808,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -804,6 +855,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: ; 
GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -850,6 +904,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -894,6 +951,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; GFX678-LABEL: test_fold_canonicalize_qnan_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -938,6 +998,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -982,6 +1045,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 
s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1026,6 +1092,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1070,6 +1139,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1114,6 +1186,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1158,6 +1233,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1202,6 +1280,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; GFX678-LABEL: v_test_canonicalize_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1257,6 +1338,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; GFX6-LABEL: s_test_canonicalize_var_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_add_i32 s10, s10, s15 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1267,6 +1351,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; GFX8-LABEL: s_test_canonicalize_var_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -1313,6 +1400,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: 
v_mov_b32_e32 v1, s1 @@ -1369,6 +1459,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1426,6 +1519,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1482,10 +1578,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1531,10 +1630,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 
v0, 0 -; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1578,10 +1680,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1623,10 +1728,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1668,10 +1776,13 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; GFX678-LABEL: test_fold_canonicalize_literal_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1713,10 +1824,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1762,10 +1876,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1810,10 +1927,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1857,10 +1977,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1905,10 +2028,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; GFX678-LABEL: test_fold_canonicalize_qnan_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1950,10 +2076,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1995,10 +2124,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2040,10 +2172,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2085,10 +2220,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2130,10 +2268,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2175,10 +2316,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_add_i32 s10, s10, s15 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2221,6 +2365,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX6-NEXT: s_add_i32 s10, s10, s15 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2238,6 +2385,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2305,6 +2455,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_add_i32 s10, s10, s15 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2322,6 +2475,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2389,6 +2545,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX6-NEXT: s_add_i32 s10, s10, s15 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2407,6 +2566,9 @@ define 
amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2475,6 +2637,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_add_i32 s10, s10, s15 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2498,6 +2663,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2568,6 +2736,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX6-NEXT: s_add_i32 s10, s10, s15 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2585,6 +2756,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX8-NEXT: 
s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2652,6 +2826,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_add_i32 s10, s10, s15 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2669,6 +2846,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2737,6 +2917,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX6-NEXT: s_add_i32 s10, s10, s15 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2755,6 +2938,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2824,6 +3010,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_add_i32 s10, s10, s15 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2847,6 +3036,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2916,6 +3108,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX6-NEXT: s_add_i32 s10, s10, s15 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -2933,6 +3128,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll index 
fee6540f43c64..ab00b132f2fd1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll @@ -6,18 +6,13 @@ ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga | FileCheck -check-prefix=NOHSA-NOADDR64 -check-prefix=ALL %s -; There are no stack objects even though flat is used by default, so -; flat_scratch_init should be disabled. - ; ALL-LABEL: {{^}}test: -; ALL-NOT: flat_scr - ; HSA-DEFAULT: flat_store_dword ; HSA-NODEFAULT: buffer_store_dword ; HSA-NOADDR64: flat_store_dword -; HSA: .amdhsa_user_sgpr_flat_scratch_init 0 +; HSA: .amdhsa_user_sgpr_flat_scratch_init 1 ; NOHSA-DEFAULT: buffer_store_dword ; NOHSA-NODEFAULT: flat_store_dword diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll index 02eb1ad945329..134b790238086 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefix=CI -check-prefix=GCN %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefix=FIJI-NOXNACK -check-prefix=GCN %s ; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s @@ -8,16 +8,16 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefixes=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | 
FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GCN %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX9-ARCH-FLAT-NOXNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX9-ARCH-FLAT-XNACK,GCN %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck 
-check-prefixes=GFX10-ARCH-FLAT-NOXNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX10-ARCH-FLAT-XNACK,GCN %s ; GCN-LABEL: {{^}}no_vcc_no_flat: @@ -25,11 +25,15 @@ ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; TotalNumSgprs: 8 -; VI-NOXNACK: ; TotalNumSgprs: 8 -; VI-XNACK: ; TotalNumSgprs: 12 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8 +; CI: ; NumSgprs: 8 +; VI-NOXNACK: ; NumSgprs: 8 +; VI-XNACK: ; NumSgprs: 12 +; HSA-VI-NOXNACK: ; NumSgprs: 8 +; HSA-VI-XNACK: ; NumSgprs: 12 +; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14 +; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14 +; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 8 +; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 8 define amdgpu_kernel void @no_vcc_no_flat() { entry: call void asm sideeffect "", "~{s7}"() @@ -42,11 +46,15 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; TotalNumSgprs: 10 -; VI-NOXNACK: ; TotalNumSgprs: 10 -; VI-XNACK: ; TotalNumSgprs: 12 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10 +; CI: ; NumSgprs: 10 +; VI-NOXNACK: ; NumSgprs: 10 +; VI-XNACK: ; NumSgprs: 12 +; HSA-VI-NOXNACK: ; NumSgprs: 10 +; HSA-VI-XNACK: ; NumSgprs: 12 +; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14 +; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14 +; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 10 +; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 10 define amdgpu_kernel void @vcc_no_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc}"() @@ -59,11 +67,16 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; TotalNumSgprs: 12 -; VI-NOXNACK: ; TotalNumSgprs: 14 -; VI-XNACK: ; TotalNumSgprs: 14 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8 +; CI: ; NumSgprs: 12 +; FIJI-NOXNACK: ; NumSgprs: 14 +; VI-NOXNACK: 
; NumSgprs: 14 +; VI-XNACK: ; NumSgprs: 14 +; HSA-VI-NOXNACK: ; NumSgprs: 24 +; HSA-VI-XNACK: ; NumSgprs: 24 +; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14 +; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14 +; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 8 +; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 8 define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{flat_scratch}"() @@ -76,11 +89,13 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; TotalNumSgprs: 12 -; VI-NOXNACK: ; TotalNumSgprs: 14 -; VI-XNACK: ; TotalNumSgprs: 14 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10 +; CI: ; NumSgprs: 12 +; VI-NOXNACK: ; NumSgprs: 14 +; VI-XNACK: ; NumSgprs: 14 +; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14 +; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14 +; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 10 +; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 10 define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"() @@ -96,11 +111,13 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: TotalNumSgprs: 4 -; VI-NOXNACK: TotalNumSgprs: 6 -; VI-XNACK: TotalNumSgprs: 6 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6 +; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6 +; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0 +; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0 define amdgpu_kernel void @use_flat_scr() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch}"() @@ -113,11 +130,13 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: TotalNumSgprs: 4 -; VI-NOXNACK: TotalNumSgprs: 6 -; VI-XNACK: TotalNumSgprs: 6 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: 
NumSgprs: 6 +; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6 +; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6 +; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0 +; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0 define amdgpu_kernel void @use_flat_scr_lo() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"() @@ -130,11 +149,13 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: TotalNumSgprs: 4 -; VI-NOXNACK: TotalNumSgprs: 6 -; VI-XNACK: TotalNumSgprs: 6 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6 +; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6 +; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0 +; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0 define amdgpu_kernel void @use_flat_scr_hi() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"() diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index c60b9858abd83..23b453438f3a0 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -16,6 +16,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; VI-LABEL: multiple_fadd_use_test_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f32_e64 v0, s3, -1.0 ; VI-NEXT: v_add_f32_e64 v1, s2, -1.0 @@ -82,8 +85,11 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; VI-NEXT: s_load_dword s3, s[6:7], 0x2c +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_add_u32 s2, s0, 4 ; VI-NEXT: v_add_f32_e64 v2, s4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -143,6 +149,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; VI-LABEL: multiple_use_fadd_fmad_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s4, s0, 4 @@ -200,6 +209,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 4 ; VI-NEXT: v_mov_b32_e32 v0, s1 @@ -263,6 +275,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -313,10 +328,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword 
v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -362,6 +380,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; VI-DENORM: ; %bb.0: ; VI-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_add_i32 s10, s10, s15 +; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16 ; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -380,6 +401,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH: ; %bb.0: ; VI-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_add_i32 s10, s10, s15 +; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 ; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -498,6 +522,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-DENORM: ; %bb.0: ; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_add_i32 s10, s10, s15 +; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 @@ -519,6 +546,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH: ; %bb.0: ; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_add_i32 s10, s10, s15 +; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -619,6 
+649,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; VI-DENORM: ; %bb.0: ; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_add_i32 s10, s10, s15 +; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 @@ -640,6 +673,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH: ; %bb.0: ; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_add_i32 s10, s10, s15 +; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 @@ -742,6 +778,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-DENORM-NEXT: s_load_dword s6, s[6:7], 0x8 +; VI-DENORM-NEXT: s_add_i32 s10, s10, s15 +; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 @@ -749,6 +787,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s1 ; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 +; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-DENORM-NEXT: s_add_u32 s4, s2, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 ; VI-DENORM-NEXT: s_addc_u32 s5, s3, 0 @@ -765,6 +804,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; VI-FLUSH-NEXT: 
s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-FLUSH-NEXT: s_load_dword s6, s[6:7], 0x8 +; VI-FLUSH-NEXT: s_add_i32 s10, s10, s15 +; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -772,6 +813,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s1 ; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 +; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-FLUSH-NEXT: s_add_u32 s4, s2, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 ; VI-FLUSH-NEXT: s_addc_u32 s5, s3, 0 @@ -875,6 +917,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 @@ -928,10 +973,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc600 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 8267bb9f5450f..c4f13749251a4 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -8,6 +8,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr 
addrspace(1) %out, half %x, ha ; CI-LABEL: fneg_fabs_fadd_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -25,6 +28,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -71,6 +77,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; CI-LABEL: fneg_fabs_fmul_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s1, s0, 0x7fff ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -89,6 +98,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -139,6 +151,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; 
CI-NEXT: v_mov_b32_e32 v0, s0 @@ -151,6 +166,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -195,6 +213,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -207,6 +228,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -249,6 +273,9 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; CIVI-LABEL: v_fneg_fabs_f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -294,6 +321,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 @@ -316,7 +346,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -325,6 +357,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -369,6 +402,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -381,6 +417,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s2, 0x80008000 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -423,6 +462,9 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; CIVI-LABEL: fneg_fabs_v4f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, 
s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -468,6 +510,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; CI-LABEL: fold_user_fneg_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| @@ -489,7 +534,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 0xc400 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -497,6 +544,7 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; VI-NEXT: v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -538,6 +586,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -555,6 +606,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, 
p ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -611,6 +665,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010 @@ -635,7 +692,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v5, 0xc400 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_lshr_b32 s1, s4, 16 @@ -644,6 +703,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff ; VI-NEXT: v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mul_f16_e64 v5, |s4|, -4.0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 ; VI-NEXT: v_mov_b32_e32 v5, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 98b17bbaa0a95..7137834b6552e 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1478,6 +1478,8 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX7-NEXT: 
s_load_dword s8, s[6:7], 0x4 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x6 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bitcmp1_b32 s8, 0 ; GFX7-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1489,6 +1491,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX7-NEXT: s_cselect_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index 40982347f3ca0..6e22c9c319f69 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -10,6 +10,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -22,6 +25,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -66,6 +72,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -80,6 +89,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -129,6 +141,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -141,6 +156,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -183,6 +201,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; CI-LABEL: v_fneg_fold_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -200,6 +221,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr 
addrspace(1) %out, ptr addrspace( ; GFX8-LABEL: v_fneg_fold_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -246,6 +270,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -258,6 +285,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -299,14 +329,17 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; CIVI-LABEL: s_fneg_v2f16_nonload: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 ; CIVI-NEXT: ;;#ASMSTART ; CIVI-NEXT: ; def s2 ; CIVI-NEXT: ;;#ASMEND ; CIVI-NEXT: s_xor_b32 s2, s2, 0x80008000 -; CIVI-NEXT: v_mov_b32_e32 v2, s2 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: flat_store_dword v[0:1], v2 ; CIVI-NEXT: s_endpgm ; @@ -349,6 +382,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr 
addrspace(1) %out, ptr addrspace(1) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -363,6 +399,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -412,6 +451,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -424,6 +466,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -466,6 +511,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; CI-LABEL: v_fneg_fold_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -492,6 +540,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; GFX8-LABEL: v_fneg_fold_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -539,6 +590,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fneg_fold_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -560,6 +614,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; GFX8-LABEL: v_extract_fneg_fold_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -624,6 +681,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 ; CIVI-LABEL: v_extract_fneg_no_fold_v2f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 3735c6349fbb3..6207c442f41ee 100644 --- 
a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -10,6 +10,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -21,6 +24,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -48,6 +54,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -59,6 +68,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -85,6 +97,9 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; CIVI-LABEL: load_v3f16_arg: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; 
CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_add_u32 s4, s0, 4 ; CIVI-NEXT: s_addc_u32 s5, s1, 0 @@ -120,6 +135,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; CIVI-LABEL: load_v4f16_arg: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v2, s2 @@ -147,6 +165,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -161,6 +182,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -193,6 +217,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -206,6 +233,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, 
<2 x half> % ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -239,6 +269,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -250,6 +283,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -279,6 +315,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -292,6 +331,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -324,6 +366,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; CI-LABEL: extload_v3f16_to_v3f32_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 @@ -337,6 +382,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; VI-LABEL: extload_v3f16_to_v3f32_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -369,6 +417,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; CI-LABEL: extload_v4f16_to_v4f32_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: s_lshr_b32 s5, s2, 16 @@ -384,6 +435,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; VI-LABEL: extload_v4f16_to_v4f32_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -421,6 +475,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, 
s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s1, 16 ; CI-NEXT: s_lshr_b32 s7, s0, 16 @@ -449,6 +506,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s1, 16 ; VI-NEXT: s_lshr_b32 s7, s0, 16 @@ -507,6 +567,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; CI-LABEL: extload_f16_to_f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -520,6 +583,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; VI-LABEL: extload_f16_to_f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s0, s[6:7], 0x8 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -553,6 +619,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; CI-LABEL: extload_v2f16_to_v2f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 @@ -569,6 +638,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr 
addrspace(1) %out, <2 ; VI-LABEL: extload_v2f16_to_v2f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s0, s[6:7], 0x8 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 @@ -608,6 +680,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; CI-LABEL: extload_v3f16_to_v3f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: s_lshr_b32 s4, s2, 16 @@ -629,6 +704,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; VI-LABEL: extload_v3f16_to_v3f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 @@ -676,6 +754,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; CI-LABEL: extload_v4f16_to_v4f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -701,6 +782,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; VI-LABEL: extload_v4f16_to_v4f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
s_lshr_b32 s5, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -756,6 +840,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 @@ -803,6 +890,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 @@ -890,6 +980,9 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr ; CIVI-LABEL: global_load_store_f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -920,6 +1013,9 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: global_load_store_v2f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -950,6 +1046,9 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add ; CIVI-LABEL: global_load_store_v4f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -980,6 +1079,9 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: global_load_store_v8f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1010,6 +1112,9 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr ; CIVI-LABEL: global_extload_f16_to_f32: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1043,6 +1148,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v2f16_to_v2f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1059,6 +1167,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v2f16_to_v2f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1096,6 
+1207,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v3f16_to_v3f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1113,6 +1227,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v3f16_to_v3f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1152,6 +1269,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v4f16_to_v4f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1171,6 +1291,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v4f16_to_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1213,6 +1336,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v8f16_to_v8f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1243,6 +1369,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v8f16_to_v8f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1301,6 +1430,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-LABEL: global_extload_v16f16_to_v16f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -1359,6 +1491,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; VI-LABEL: global_extload_v16f16_to_v16f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1458,6 +1593,9 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr ; CIVI-LABEL: global_extload_f16_to_f64: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1494,6 +1632,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; CI-LABEL: 
global_extload_v2f16_to_v2f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1512,6 +1653,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v2f16_to_v2f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1554,6 +1698,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v3f16_to_v3f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1579,6 +1726,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v3f16_to_v3f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1632,6 +1782,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v4f16_to_v4f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; 
CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1660,6 +1813,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v4f16_to_v4f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1719,6 +1875,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v8f16_to_v8f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1767,6 +1926,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v8f16_to_v8f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1853,6 +2015,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-LABEL: global_extload_v16f16_to_v16f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1948,6 +2113,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-LABEL: global_extload_v16f16_to_v16f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; 
VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2103,6 +2271,9 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p ; CIVI-LABEL: global_truncstore_f32_to_f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2136,6 +2307,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v2f32_to_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2153,6 +2327,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v2f32_to_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2191,6 +2368,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v3f32_to_v3f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2214,6 +2394,9 @@ define amdgpu_kernel void 
@global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v3f32_to_v3f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2261,6 +2444,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v4f32_to_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2282,6 +2468,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v4f32_to_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2326,6 +2515,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v8f32_to_v8f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2361,6 +2553,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v8f32_to_v8f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 
flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2426,6 +2621,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-LABEL: global_truncstore_v16f32_to_v16f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 32 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -2495,6 +2693,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI-LABEL: global_truncstore_v16f32_to_v16f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -2607,6 +2808,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; CI-LABEL: fadd_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -2624,6 +2828,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2656,6 +2863,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; CI-LABEL: fadd_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2677,6 +2887,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -2710,6 +2923,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-LABEL: fadd_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2747,6 +2963,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-LABEL: fadd_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2787,6 +3006,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 { ; CI-LABEL: fadd_v8f16: ; CI: ; %bb.0: +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2845,6 +3067,9 @@ define 
amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; ; VI-LABEL: fadd_v8f16: ; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2909,6 +3134,9 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr ; CIVI-LABEL: test_bitcast_from_half: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -2940,6 +3168,9 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs ; CIVI-LABEL: test_bitcast_to_half: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index 8c017fa5ec263..51b0e2b86cdf3 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -15,7 +15,10 @@ ; CHECK: .max_flat_workgroup_size: 1024 ; CHECK: .name: test ; CHECK: .private_segment_fixed_size: 0 -; CHECK: .sgpr_count: 10 +; GFX700: .sgpr_count: 22 +; GFX803: .sgpr_count: 24 +; GFX900: .sgpr_count: 10 +; GFX1010: .sgpr_count: 10 ; CHECK: .symbol: test.kd ; CHECK: .vgpr_count: {{3|6}} ; WAVE64: .wavefront_size: 64 @@ -48,8 +51,8 @@ entry: ; CHECK: .name: num_spilled_sgprs ; GFX700: .sgpr_spill_count: 10 -; GFX803: 
.sgpr_spill_count: 10 -; GFX900: .sgpr_spill_count: 62 +; GFX803: .sgpr_spill_count: 0 +; GFX900: .sgpr_spill_count: 0 ; GFX1010: .sgpr_spill_count: 60 ; CHECK: .symbol: num_spilled_sgprs.kd define amdgpu_kernel void @num_spilled_sgprs( diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll index 37476203fbfad..2c38e201d326f 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -43,7 +43,7 @@ ; ELF: 00E0: 6E616D65 A673696D 706C65BB 2E707269 ; ELF: 00F0: 76617465 5F736567 6D656E74 5F666978 ; ELF: 0100: 65645F73 697A6500 AB2E7367 70725F63 -; ELF: 0110: 6F756E74 06B12E73 6770725F 7370696C +; ELF: 0110: 6F756E74 0EB12E73 6770725F 7370696C ; ELF: 0120: 6C5F636F 756E7400 A72E7379 6D626F6C ; ELF: 0130: A973696D 706C652E 6B64AB2E 76677072 ; ELF: 0140: 5F636F75 6E7403B1 2E766770 725F7370 @@ -59,7 +59,7 @@ ; ELF: 01E0: 73696D70 6C655F6E 6F5F6B65 726E6172 ; ELF: 01F0: 6773BB2E 70726976 6174655F 7365676D ; ELF: 0200: 656E745F 66697865 645F7369 7A6500AB -; ELF: 0210: 2E736770 725F636F 756E7400 B12E7367 +; ELF: 0210: 2E736770 725F636F 756E740C B12E7367 ; ELF: 0220: 70725F73 70696C6C 5F636F75 6E7400A7 ; ELF: 0230: 2E73796D 626F6CB5 73696D70 6C655F6E ; ELF: 0240: 6F5F6B65 726E6172 67732E6B 64AB2E76 diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index c998a4b19121e..8f9b223c361d6 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -10,11 +10,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 -; GFX8V4-NEXT: s_add_i32 s8, s8, s11 -; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 -; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s9 +; GFX8V4-NEXT: 
s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 +; GFX8V4-NEXT: s_add_i32 s12, s12, s17 +; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 ; GFX8V4-NEXT: s_cselect_b32 s3, s3, 0 @@ -36,11 +36,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xc8 -; GFX8V5-NEXT: s_add_i32 s6, s6, s9 -; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0xc8 +; GFX8V5-NEXT: s_add_i32 s10, s10, s15 +; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 ; GFX8V5-NEXT: s_cselect_b32 s2, s2, 0 @@ -62,9 +62,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) @@ -88,9 +88,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9V5-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) @@ -124,6 +124,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 ; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V4-NEXT: s_add_i32 s12, s12, s17 +; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -136,6 +139,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V5: ; %bb.0: ; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xcc ; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V5-NEXT: s_add_i32 s10, s10, s15 +; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -178,6 +184,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 ; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V4-NEXT: s_add_i32 s12, s12, s17 +; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -190,6 +199,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V5: ; %bb.0: ; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 ; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V5-NEXT: s_add_i32 s10, s10, s15 +; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: 
s_cselect_b64 s[0:1], -1, 0 @@ -273,7 +285,10 @@ define amdgpu_kernel void @llvm_debugtrap() { define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_add_i32 s12, s12, s17 ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s7 ; GFX8V4-NEXT: s_add_u32 s0, s8, 8 ; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc @@ -298,6 +313,9 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_add_i32 s10, s10, s15 +; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8V5-NEXT: s_add_u32 s0, s6, 8 ; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V5-NEXT: s_addc_u32 s1, s7, 0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll index f419d89a7f0a4..b283a8fca8a39 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll @@ -68,6 +68,6 @@ if.end: ret void } ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 292722c2607ad..ff5c71b65f34b 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %11 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %12 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %12 + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %9 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %9 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %10 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %10 + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* 
regdef:VReg_128 */, def %11 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %11 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %12 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %12 + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %9 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %9 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6619145 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %10 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %10 + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -46,16 +46,16 @@ define amdgpu_kernel void @v_input_output_i128() { define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:AReg_128 */, def %11 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %11 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %12 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %12 + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 
6160393 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:AReg_128_Align2 */, def %9 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %9 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6488073 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %10 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %10 + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index c68138acc9b2b..cf6ae3d5b4f68 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -22,6 +22,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; VI-LABEL: s_insertelement_v2bf16_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -82,6 +85,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; VI-LABEL: s_insertelement_v2bf16_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ 
-144,6 +150,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -216,6 +225,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -286,6 +298,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -358,6 +373,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -435,11 +453,14 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; 
VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -530,14 +551,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, s4, v0, v4 @@ -609,14 +633,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; 
VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, v0, s4, v4 @@ -686,14 +713,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, s4, v1, v4 @@ -765,14 +795,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, v1, s4, v4 @@ -848,9 +881,12 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 
flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -942,9 +978,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 @@ -1058,9 +1097,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -1237,11 +1279,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -1409,11 +1454,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 647870f0e0897..aafa4a04a00de 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -20,6 +20,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2i16_0: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -69,6 +72,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 
@@ -85,6 +91,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -140,6 +149,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -160,6 +172,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -222,6 +237,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -237,6 +255,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 
flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -293,6 +314,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -312,6 +336,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -382,6 +409,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -404,6 +434,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -475,6 +508,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr 
ad ; CIVI-LABEL: s_insertelement_v2i16_1: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -523,6 +559,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -539,6 +578,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -589,6 +631,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2f16_0: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -636,6 +681,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2f16_1: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_add_i32 s10, s10, s15 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -684,6 +732,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -702,6 +753,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -760,9 +814,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -779,9 +836,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, 
s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v3, v[0:1] ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -838,6 +898,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -856,6 +919,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -913,6 +979,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -931,6 +1000,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -987,6 +1059,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1005,6 +1080,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1061,6 +1139,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1079,6 +1160,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1135,6 +1219,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: 
v_add_u32_e32 v0, vcc, s2, v2 @@ -1153,6 +1240,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1209,6 +1299,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1227,6 +1320,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1283,6 +1379,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1301,6 +1400,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, 
s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1363,6 +1465,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1382,6 +1487,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1446,9 +1554,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -1467,9 +1578,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v3, v[0:1] ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -1535,11 +1649,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -1562,11 +1679,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_load_dword v4, v[0:1] ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -1637,14 +1757,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s10, s10, s15 +; 
VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, s4, v0, v4 @@ -1656,9 +1779,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1718,14 +1844,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, v0, s4, v4 @@ -1737,9 +1866,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) 
%out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1800,14 +1932,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, s4, v1, v4 @@ -1819,9 +1954,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1881,14 +2019,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; 
VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, v1, s4, v4 @@ -1900,9 +2041,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1963,14 +2107,17 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; 
VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, s4, v1, v4 @@ -1982,9 +2129,12 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -2050,6 +2200,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2077,6 +2230,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: flat_load_dword v4, v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2160,9 +2316,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; 
VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -2186,9 +2345,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -2258,9 +2420,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 @@ -2278,9 +2443,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -2341,9 +2509,12 @@ define amdgpu_kernel void 
@v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -2361,9 +2532,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -2455,9 +2629,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -2509,9 +2686,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 
flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -2656,11 +2836,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -2683,9 +2866,12 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s3 ; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[4:5] ; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v4 @@ -2761,12 +2947,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: 
v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -2774,6 +2962,7 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 +; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_perm_b32 v3, s4, v3, v12 @@ -2787,11 +2976,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -2928,11 +3120,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 
v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -3025,11 +3220,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3] ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll index a2f55d6aa8396..758e8b6e1e2ee 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -30,7 +30,7 @@ define hidden void @func() #1 { ; GCN-NOT: writelane ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 -; GCN: ; NumSgprs: 33 +; GCN: ; NumSgprs: 37 ; GCN: ; NumVgprs: 9 define amdgpu_kernel void @kernel_call() #0 { %vgpr = load volatile i32, ptr addrspace(1) undef diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll index 496a1c652da25..1a32953305bbc 100644 --- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll +++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll @@ -7,7 +7,7 @@ declare void @llvm.trap() #0 ; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_kernarg_size 8 -; DOORBELL-NEXT: .amdhsa_user_sgpr_count 12 +; DOORBELL-NEXT: .amdhsa_user_sgpr_count 14 ; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 ; DOORBELL: .end_amdhsa_kernel diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index aebc8315514fb..d51ace630f692 100644 --- 
a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -113,16 +113,24 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s0, s0, s7 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 -; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 +; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_lshl_b32 s4, s15, 2 @@ -168,16 +176,24 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s0, s0, s7 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: 
s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 -; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 +; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_lshl_b32 s4, s15, 2 @@ -223,16 +239,24 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s0, s0, s7 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 -; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; 
CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 +; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_lshl_b32 s4, s15, 2 @@ -278,16 +302,24 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s0, s0, s7 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 -; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: 
s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 +; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_lshl_b32 s4, s15, 2 @@ -320,7 +352,12 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s0, s0, s7 +; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_add_u32 s8, s6, 8 @@ -334,8 +371,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id ; CHECK-NEXT: v_mov_b32_e32 v3, 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_mov_b32 s15, 0 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: ds_write_b16 v4, v3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm @@ -348,15 +385,24 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s0, s0, s7 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, 
use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s15, 4 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] @@ -383,7 +429,12 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s0, s0, s7 +; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_add_u32 s8, s6, 8 @@ -397,8 +448,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 ; CHECK-NEXT: v_mov_b32_e32 v3, 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_mov_b32 s15, 2 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: 
v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: ds_write_b16 v4, v3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm @@ -411,15 +462,24 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s0, s0, s7 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s15, 6 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] @@ -446,7 +506,12 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s0, s0, s7 +; CHECK-NEXT: 
s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_add_u32 s8, s6, 8 @@ -460,8 +525,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 ; CHECK-NEXT: v_mov_b32_e32 v3, 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_mov_b32 s15, 1 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: ds_write_b16 v4, v3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm @@ -474,15 +539,24 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s0, s0, s7 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; 
CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s15, 5 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] @@ -509,7 +583,12 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s0, s0, s7 +; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_add_u32 s8, s6, 8 @@ -523,8 +602,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i ; CHECK-NEXT: v_mov_b32_e32 v3, 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_mov_b32 s15, 3 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: ds_write_b16 v4, v3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_endpgm @@ -537,15 +616,24 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s0, s0, s7 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_add_u32 s8, s4, 8 -; CHECK-NEXT: s_addc_u32 s9, s5, 0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, 
s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s10, s10, s15 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: s_add_u32 s8, s6, 8 +; CHECK-NEXT: s_addc_u32 s9, s7, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s15, 7 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: s_getpc_b64 s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index 8fadfe3d02666..c998a00727793 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -23,8 +23,11 @@ define void @function_lds_id(ptr addrspace(1) %out) { define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: kernel_lds_id: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s2, s10, 42 +; GCN-NEXT: s_add_i32 s2, s12, 42 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -42,19 +45,28 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l ; GCN-LABEL: indirect_lds_id: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; 
GCN-NEXT: s_add_u32 s0, s0, s7 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_getpc_b64 s[8:9] -; GCN-NEXT: s_add_u32 s8, s8, function_lds_id@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s9, s9, function_lds_id@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] +; GCN-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GCN-NEXT: s_add_u32 s8, s6, 8 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: s_addc_u32 s9, s7, 0 +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, function_lds_id@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 21 -; GCN-NEXT: s_mov_b32 s12, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm call void @function_lds_id(ptr addrspace(1) %out) ret void @@ -63,6 +75,9 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: doesnt_use_it: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v2, 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 39a3b1c8adc9f..44ea414dd4b93 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -259,6 +259,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -269,6 +272,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -284,10 +290,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -296,10 +305,13 @@ define amdgpu_kernel void 
@test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -312,10 +324,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -324,11 +339,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 
s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -341,12 +359,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_m0: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm @@ -354,12 +375,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; CHECK-GISEL-LABEL: test_readfirstlane_m0: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -373,25 +397,31 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -405,13 +435,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -419,13 +452,16 @@ define amdgpu_kernel void 
@test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -439,13 +475,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -453,13 +492,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; 
CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 24a332fa211c1..ffceac2c912bb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -179,6 +179,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -189,6 +192,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -204,10 +210,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, 
s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -216,10 +225,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -232,10 +244,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -244,11 +259,14 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-GISEL: ; %bb.0: ; 
CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -262,6 +280,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -281,6 +302,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -311,6 +335,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 
8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -332,6 +359,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -365,6 +395,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -386,6 +419,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -419,12 +455,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; CHECK-SDAG-LABEL: test_readlane_m0_sreg: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm @@ -432,12 +471,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; CHECK-GISEL-LABEL: test_readlane_m0_sreg: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -454,11 +496,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; @@ -468,10 +513,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; 
CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -485,14 +533,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 ; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -505,10 +556,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] @@ -523,14 +577,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 ; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -543,10 +600,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -561,25 +621,31 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: 
s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -593,13 +659,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -607,13 +676,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64: 
; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -627,13 +699,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -641,13 +716,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; 
CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 9d93ca65683c4..da40a06c306b9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; GFX802-SDAG-LABEL: test_writelane_sreg_i32: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -55,6 +58,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; GFX802-GISEL-LABEL: test_writelane_sreg_i32: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -102,6 +108,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -153,6 +162,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -210,6 +222,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -261,6 +276,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -318,6 +336,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; 
GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -362,6 +383,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -412,6 +436,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -462,6 +489,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -518,11 +548,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; 
GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 ; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -573,11 +606,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 ; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -633,6 +669,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -694,6 +733,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: 
v_lshlrev_b32_e32 v2, 3, v0 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -766,6 +808,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -833,6 +878,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -909,7 +957,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -918,6 +968,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; 
GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1] ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -980,7 +1031,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -990,6 +1043,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0 ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1064,15 +1118,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX802-SDAG-NEXT: ;;#ASMSTART ; GFX802-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX802-SDAG-NEXT: ;;#ASMEND +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 ; GFX802-SDAG-NEXT: s_mov_b32 s4, m0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: 
v_mov_b32_e32 v2, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX802-SDAG-NEXT: s_endpgm ; @@ -1119,15 +1176,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX802-GISEL-NEXT: ;;#ASMSTART ; GFX802-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX802-GISEL-NEXT: ;;#ASMEND +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 ; GFX802-GISEL-NEXT: s_mov_b32 s4, m0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX802-GISEL-NEXT: s_endpgm ; @@ -1178,6 +1238,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -1222,6 +1285,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1271,6 +1337,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; GFX802-SDAG-LABEL: test_writelane_imm_i64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1316,6 +1385,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; GFX802-GISEL-LABEL: test_writelane_imm_i64: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1367,6 +1439,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; GFX802-SDAG-LABEL: test_writelane_imm_f64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1412,6 +1487,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; GFX802-GISEL-LABEL: test_writelane_imm_f64: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1464,6 +1542,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 @@ -1503,6 +1584,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 @@ -1548,10 +1632,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0 @@ -1596,11 +1683,14 @@ define amdgpu_kernel void 
@test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s8 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3 @@ -1649,10 +1739,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0 @@ -1697,11 +1790,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s8 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3 @@ -1748,7 +1844,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -1782,7 +1881,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1822,11 +1924,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: 
v_writelane_b32 v1, s3, m0 ; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0 @@ -1867,11 +1972,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1917,11 +2025,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0 @@ -1962,11 +2073,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index 7202ab8b31466..59cc6dfac1200 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; GFX7-HSA-LABEL: constant_load_f64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -93,6 +96,9 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; GFX7-HSA-LABEL: constant_load_2v4f64: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 2ee1c60b4bbf2..5d69aa7d679be 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -26,6 +26,9 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: constant_load_i16: ; GCN-HSA: ; %bb.0: ; %entry ; 
GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -108,6 +111,9 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -181,6 +187,9 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 4 @@ -287,6 +296,9 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -363,6 +375,9 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, 
s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -449,6 +464,9 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; ; GCN-HSA-LABEL: constant_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -591,6 +609,9 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-HSA-LABEL: constant_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 @@ -804,6 +825,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -881,6 +905,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -959,6 +986,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1036,6 +1066,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1112,6 +1145,9 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1199,6 +1235,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1290,6 +1329,9 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1389,6 +1431,9 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1493,6 +1538,9 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1601,6 +1649,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v4, s0 @@ -1720,6 +1771,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1880,6 +1934,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2059,6 +2116,9 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2323,6 +2383,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ 
-2631,6 +2694,9 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3112,6 +3178,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3683,6 +3752,9 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4606,6 +4678,9 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ 
-5394,6 +5469,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5482,6 +5560,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5569,6 +5650,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5652,6 +5736,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5738,12 +5825,15 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) 
%ou ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s0, s2, 16 ; GCN-HSA-NEXT: s_and_b32 s1, s2, 0xffff @@ -5834,6 +5924,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -5939,10 +6032,13 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16 ; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16 @@ -6076,6 +6172,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; 
GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6234,10 +6333,13 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 @@ -6454,6 +6556,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6717,10 +6822,13 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 
s[4:11], s[2:3], 0x0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16 @@ -7104,6 +7212,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7582,10 +7693,13 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s19, s1, 16 ; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 16 @@ -8310,6 +8424,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 4ab55164e0999..6283f6bb3c5e3 
100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -23,6 +23,9 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; GFX7-HSA-LABEL: constant_load_i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -105,6 +108,9 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v2i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -194,6 +200,9 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -290,6 +299,9 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v4i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -390,6 +402,9 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v8i32: ; GFX7-HSA: ; %bb.0: ; %entry +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -526,6 +541,9 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v9i32: ; GFX7-HSA: ; %bb.0: ; %entry +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s12, s[10:11], 0x8 @@ -689,6 +707,9 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v10i32: ; GFX7-HSA: ; %bb.0: ; %entry +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8 @@ -861,6 +882,9 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-LABEL: constant_load_v11i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -1038,6 +1062,9 
@@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v12i32: ; GFX7-HSA: ; %bb.0: ; %entry +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 @@ -1221,6 +1248,9 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-LABEL: constant_load_v16i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 48 @@ -1409,6 +1439,9 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; GFX7-HSA-LABEL: constant_zextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1495,6 +1528,9 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; GFX7-HSA-LABEL: constant_sextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1587,6 +1623,9 @@ define amdgpu_kernel void 
@constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1673,6 +1712,9 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1767,12 +1809,15 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 @@ -1867,6 +1912,9 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: 
s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1983,13 +2031,16 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 @@ -2118,6 +2169,9 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2282,8 +2336,10 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 @@ -2291,6 +2347,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 @@ -2491,6 +2548,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -2789,6 +2849,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3245,13 +3308,16 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; 
GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 @@ -3681,6 +3747,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4539,14 +4608,17 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s34, s36, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s35, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 @@ -5160,6 +5232,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-LABEL: constant_load_v32i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 46c7c2b08cd64..45f0af8d423b6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; GFX7-LABEL: constant_load_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -97,6 +100,9 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-LABEL: constant_load_v2i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 @@ -183,6 +189,9 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-LABEL: constant_load_v3i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -299,6 +308,9 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v4i64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -430,6 +442,9 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-LABEL: constant_load_v8i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-NEXT: s_add_u32 s18, s16, 48 @@ -649,6 +664,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-LABEL: constant_load_v16i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX7-NEXT: s_add_i32 s10, s10, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index ce17c81a24dd5..43b79973187a6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -27,6 +27,9 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; GFX7-HSA-LABEL: constant_load_i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -114,6 +117,9 @@ define amdgpu_kernel void @constant_load_v2i8(ptr 
addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v2i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -199,6 +205,9 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v3i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -311,6 +320,9 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v4i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -382,6 +394,9 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v8i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -458,6 +473,9 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: 
constant_load_v16i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -541,6 +559,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_zextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -618,6 +639,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_sextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -696,6 +720,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -773,6 +800,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: ; 
GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -854,6 +884,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -955,6 +988,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1054,6 +1090,9 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1157,6 +1196,9 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1260,6 +1302,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1366,6 +1411,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1485,6 +1533,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1646,6 +1697,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; 
GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1830,6 +1884,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2098,6 +2155,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2414,6 +2474,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2898,6 +2961,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 
0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3483,6 +3549,9 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4402,6 +4471,9 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5211,6 +5283,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_zextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5295,6 +5370,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_sextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; 
GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5382,6 +5460,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5464,6 +5545,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5554,6 +5638,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5663,6 +5750,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: 
s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5778,10 +5868,13 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_bfe_u32 s4, s2, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 24 @@ -5918,6 +6011,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6079,10 +6175,13 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) 
; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24 @@ -6303,6 +6402,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6574,10 +6676,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s8, s5, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s9, s4, 24 @@ -6968,6 +7073,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7459,10 +7567,13 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24 @@ -8205,6 +8316,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -8979,6 +9093,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_zextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9065,6 +9182,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_sextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9153,6 +9273,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr 
addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9239,6 +9362,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9330,6 +9456,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9431,6 +9560,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9545,6 +9677,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: 
constant_zextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9681,6 +9816,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9839,6 +9977,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -10043,6 +10184,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -10292,6 +10436,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-LABEL: 
constant_zextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10650,6 +10797,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11099,6 +11249,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -11763,6 +11916,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index e0c2d00891250..6a973d0adeffa 100644 --- 
a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -28,6 +28,9 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace( ; GCN-HSA-LABEL: global_load_i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -133,6 +136,9 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -219,6 +225,9 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -350,6 +359,9 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -435,6 +447,9 @@ define 
amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -523,6 +538,9 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-LABEL: global_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -673,6 +691,9 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; GCN-HSA-LABEL: global_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -822,6 +843,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -907,6 +931,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: 
global_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -995,6 +1022,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1080,6 +1110,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1170,6 +1203,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1269,6 +1305,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 
+; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1370,6 +1409,9 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1480,6 +1522,9 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1597,6 +1642,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1712,6 +1760,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 
flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1834,6 +1885,9 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1983,6 +2037,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2147,6 +2204,9 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2383,6 +2443,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2654,6 +2717,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -3065,6 +3131,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3584,6 +3653,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -4388,6 +4460,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 
s3 @@ -5153,6 +5228,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5250,6 +5328,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5345,6 +5426,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5437,6 +5521,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5535,6 +5622,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; 
GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5644,6 +5734,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5762,6 +5855,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5907,6 +6003,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6067,10 +6166,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6085,8 +6184,11 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, v4 @@ -6286,6 +6388,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6536,10 +6641,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: 
v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6556,7 +6661,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, v8 @@ -6916,6 +7024,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -7387,6 +7498,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -8089,6 +8203,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 diff --git 
a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 4d7f1a9663c3d..8322eee826495 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -27,6 +27,9 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; GCNX3-HSA-LABEL: global_load_i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -106,6 +109,9 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v2i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -186,6 +192,9 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v3i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -270,6 +279,9 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v4i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 
flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -352,6 +364,9 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v8i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -458,6 +473,9 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v9i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -589,6 +607,9 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v10i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -719,6 +740,9 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v11i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCNX3-HSA-NEXT: 
s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -854,6 +878,9 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v12i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -987,6 +1014,9 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v16i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -1134,6 +1164,9 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1217,6 +1250,9 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1301,6 +1337,9 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1384,6 +1423,9 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1471,6 +1513,9 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1568,6 +1613,9 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt 
lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1670,8 +1718,10 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1679,6 +1729,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2 @@ -1796,6 +1847,9 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1937,8 +1991,10 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 ; 
GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1953,6 +2009,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0 @@ -2130,6 +2187,9 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2366,6 +2426,9 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2727,8 +2790,10 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2762,6 +2827,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) @@ -3118,6 +3184,9 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3589,7 +3658,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s13 +; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s15 ; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 @@ -3909,6 +3978,9 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -4433,6 +4505,9 @@ define 
amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v32i32: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll index 4dfc773d615e4..1a6fa3c518ca7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll @@ -13,7 +13,8 @@ ; GCN: s_cselect_b32 ; GCN-NOT: load_dword -; GCN: flat_load_dwordx2 +; GCN: flat_load_dword +; GCN: flat_load_dword ; GCN-NOT: load_dword ; GCN: flat_store_dwordx2 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index 05ad567478675..12df2b0ed9380 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -161,7 +161,10 @@ define amdgpu_kernel void @k01() { ; GCN-LABEL: k01: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_add_u32 s0, s0, s7 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -182,6 +185,7 @@ define amdgpu_kernel void @k01() { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm + call void @f0() call void @f1() ret void @@ -197,7 +201,10 @@ define amdgpu_kernel void @k23() { ; GCN-LABEL: k23: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_add_u32 s0, s0, s7 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -244,7 +251,10 @@ define amdgpu_kernel void @k123() { ; GCN-LABEL: k123: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_add_u32 s0, s0, s7 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll index 3453ff9d296c0..72a0aceaae12b 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -226,7 +226,10 @@ define amdgpu_kernel void @k01() { ; GCN-LABEL: k01: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_add_u32 s0, s0, s7 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -270,7 +273,10 @@ define amdgpu_kernel void @k23() { ; GCN-LABEL: k23: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_add_u32 s0, s0, s7 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] @@ -317,7 +323,10 @@ define amdgpu_kernel void @k123() { ; GCN-LABEL: k123: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_add_u32 s0, s0, s7 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll index e876a8d9dda69..245a2775d9f2f 100644 --- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll +++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll @@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 ; GCN-LABEL: {{^}}get_global_id_0: ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff -; GCN: s_mul_i32 [[MUL:s[0-9]+]], s10, [[WGSIZEX]] +; GCN: s_mul_i32 [[MUL:s[0-9]+]], s12, [[WGSIZEX]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0 define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 { %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll index 92536c2078514..e8632871f56ea 100644 --- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_mul_i32 s10, s10, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s10 +; GFX9-NEXT: s_mul_i32 s12, s12, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s12 ; GFX9-NEXT: v_add_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] @@ -39,8 +39,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s4, s4, 0xffff -; GFX10-NEXT: s_mul_i32 
s10, s10, s4 -; GFX10-NEXT: v_add3_u32 v0, s5, s10, v0 +; GFX10-NEXT: s_mul_i32 s12, s12, s4 +; GFX10-NEXT: v_add3_u32 v0, s5, s12, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 0a76e169e9c38..0348737a41a30 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -9,6 +9,8 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 ; CHECK-LABEL: memcpy_p0_p0_minsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 @@ -186,7 +188,7 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_add_u32 s16, s16, s15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 @@ -566,7 +568,9 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 -; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; CHECK-NEXT: s_add_u32 s16, s16, s15 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s0 @@ -981,6 +985,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-NEXT: 
ds_read_u8 v8, v2 offset:117 ; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 ; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -1255,6 +1261,8 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK-LABEL: memcpy_p0_p0_optsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 @@ -1432,7 +1440,7 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_add_u32 s16, s16, s15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 @@ -1812,7 +1820,9 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 -; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; CHECK-NEXT: s_add_u32 s16, s16, s15 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s0 @@ -2227,6 +2237,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-NEXT: ds_read_u8 v8, v2 offset:117 ; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 ; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; CHECK-NEXT: 
s_addc_u32 flat_scratch_hi, s11, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 05ef2698c1f77..86cdf3ccd0441 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -34,10 +34,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -56,10 +59,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -146,6 +152,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_imin_sle_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: 
v_mov_b32_e32 v0, s0 @@ -157,6 +166,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_imin_sle_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -218,6 +230,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; CI-LABEL: s_test_imin_sle_v1i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -229,6 +244,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; VI-LABEL: s_test_imin_sle_v1i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -292,6 +310,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; CI-LABEL: s_test_imin_sle_v4i32: ; CI: ; %bb.0: +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -310,6 +331,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; VI-LABEL: s_test_imin_sle_v4i32: ; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_load_dwordx8 s[8:15], 
s[6:7], 0x10 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -422,11 +446,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; CI-NEXT: s_load_dword s2, s[6:7], 0xa ; CI-NEXT: s_load_dword s3, s[6:7], 0x13 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: s_sext_i32_i8 s3, s3 ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_byte v[0:1], v2 @@ -437,11 +464,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; VI-NEXT: s_load_dword s2, s[6:7], 0x28 ; VI-NEXT: s_load_dword s3, s[6:7], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -559,6 +589,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; CI-NEXT: s_load_dword s2, s[6:7], 0xa ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_load_dword s3, s[6:7], 0x13 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 24 ; CI-NEXT: s_sext_i32_i8 s5, s2 @@ -582,6 +614,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; CI-NEXT: s_and_b32 s3, s3, 0xffff ; CI-NEXT: s_or_b32 s2, s3, s2 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: 
v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -591,7 +624,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x28 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 24 ; VI-NEXT: s_bfe_i32 s5, s2, 0x80010 @@ -615,6 +649,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_or_b32 s2, s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -769,6 +804,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; CI-LABEL: s_test_imin_sle_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 16 ; CI-NEXT: s_sext_i32_i16 s2, s2 @@ -788,6 +826,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; VI-LABEL: s_test_imin_sle_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 16 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -922,6 +963,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s6, s0, 16 ; CI-NEXT: s_ashr_i32 s7, s1, 16 @@ -952,6 +996,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s6, s1, 16 ; VI-NEXT: s_sext_i32_i16 s1, s1 @@ -1050,10 +1097,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1072,10 +1122,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1191,10 +1244,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; 
CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1213,10 +1269,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1304,6 +1363,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_imin_slt_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1315,6 +1377,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_imin_slt_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1378,6 +1443,9 @@ define amdgpu_kernel void 
@s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s1, s1, s3 ; CI-NEXT: s_min_i32 s0, s0, s2 @@ -1392,6 +1460,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s1, s1, s3 ; VI-NEXT: s_min_i32 s0, s0, s2 @@ -1466,6 +1537,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1478,6 +1552,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1545,6 +1622,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1557,6 +1637,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1636,10 +1719,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1658,10 +1744,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1767,12 +1856,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: 
v_lshlrev_b32_e32 v6, 4, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v6 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, s5 ; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v6 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] ; CI-NEXT: flat_load_dwordx3 v[3:5], v[3:4] @@ -1791,12 +1883,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v6 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] ; VI-NEXT: flat_load_dwordx3 v[3:5], v[3:4] @@ -1930,12 +2025,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] @@ -1966,12 +2064,15 @@ define amdgpu_kernel 
void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] @@ -2070,6 +2171,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_umin_ule_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2081,6 +2185,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_umin_ule_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2155,10 +2262,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -2177,10 +2287,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -2286,6 +2399,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 @@ -2307,6 +2423,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0 @@ -2394,6 +2513,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_umin_ult_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2405,6 +2527,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_umin_ult_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2488,6 +2613,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; CI-LABEL: v_test_umin_ult_i32_multi_use: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2509,6 +2637,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; VI-LABEL: v_test_umin_ult_i32_multi_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2638,6 +2769,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; CI-LABEL: v_test_umin_ult_i16_multi_use: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 @@ -2660,6 +2794,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; VI-LABEL: v_test_umin_ult_i16_multi_use: ; VI: ; %bb.0: ; 
VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2752,6 +2889,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; CI-LABEL: s_test_umin_ult_v1i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2763,6 +2903,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; VI-LABEL: s_test_umin_ult_v1i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2834,6 +2977,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; CI-LABEL: s_test_umin_ult_v8i32: ; CI: ; %bb.0: +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2865,6 +3011,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; VI-LABEL: s_test_umin_ult_v8i32: ; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3125,6 +3274,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr 
addrspace(1) %out, <8 x i16 ; ; CI-LABEL: s_test_umin_ult_v8i16: ; CI: ; %bb.0: +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3171,6 +3323,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; VI-LABEL: s_test_umin_ult_v8i16: ; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3296,11 +3451,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; CI-NEXT: s_load_dword s2, s[6:7], 0xa ; CI-NEXT: s_load_dword s3, s[6:7], 0x13 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_and_b32 s3, s3, 0xffff ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -3311,11 +3469,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; VI-NEXT: s_load_dword s2, s[6:7], 0x28 ; VI-NEXT: s_load_dword s3, s[6:7], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_and_b32 s3, s3, 0xffff ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ 
-3405,11 +3566,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; CI-NEXT: s_load_dword s2, s[6:7], 0xa ; CI-NEXT: s_load_dword s3, s[6:7], 0x13 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s2, s2 ; CI-NEXT: s_sext_i32_i16 s3, s3 ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -3420,11 +3584,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; VI-NEXT: s_load_dword s2, s[6:7], 0x28 ; VI-NEXT: s_load_dword s3, s[6:7], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -3521,6 +3688,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s3, s2 ; CI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3535,6 +3705,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 
8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s3, s2 ; VI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3617,6 +3790,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3635,6 +3811,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3727,6 +3906,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3745,6 +3927,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3837,6 +4022,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3855,6 +4043,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3947,6 +4138,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3965,6 +4159,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -4081,9 +4278,12 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_load_dword v4, v[0:1] @@ -4112,10 +4312,13 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -4233,9 +4436,12 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_load_dword v4, v[0:1] @@ -4263,10 +4469,13 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc diff --git 
a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll index 5792fab7011af..d7814c52828b8 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -180,6 +180,9 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -260,6 +263,9 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -341,6 +347,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -403,6 +412,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 
s0, v0 @@ -465,6 +477,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -527,6 +542,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -588,6 +606,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s10, s10, s15 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll index 529e64715500d..3de6945f95556 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -176,6 +176,9 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX803-NEXT: s_add_i32 s10, s10, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -254,6 +257,9 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX803-NEXT: s_add_i32 s10, s10, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -333,6 +339,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX803-NEXT: s_add_i32 s10, s10, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -393,6 +402,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX803-NEXT: s_add_i32 s10, s10, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -454,6 +466,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX803-NEXT: s_add_i32 s10, s10, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -514,6 +529,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) 
#0 ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX803-NEXT: s_add_i32 s10, s10, s15 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 37d0309caac0a..993eb11ef5e95 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -10,32 +10,36 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908: bb.0 (%ir-block.0): ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} - ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %6 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %7 - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %27 + ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %27 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %24 + ; REGALLOC-GFX908-NEXT: SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 
4, addrspace 5) + ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]] + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) - ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX908-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] + ; 
REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX908-NEXT: S_ENDPGM 0 ; ; PEI-GFX908-LABEL: name: partial_copy ; PEI-GFX908: bb.0 (%ir-block.0): - ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7 + ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9 ; PEI-GFX908-NEXT: {{ $}} - ; PEI-GFX908-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; PEI-GFX908-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - ; PEI-GFX908-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; PEI-GFX908-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 + ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0 ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 - ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) + ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, 
implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) @@ -44,7 +48,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec ; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) + ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -55,31 +59,34 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A: bb.0 (%ir-block.0): ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* 
reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %6 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %7 - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %26 + ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %26 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %24 + ; REGALLOC-GFX90A-NEXT: SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, %7, 0, 0, implicit $exec 
:: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX90A-NEXT: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX90A-NEXT: S_ENDPGM 0 ; ; PEI-GFX90A-LABEL: name: partial_copy ; PEI-GFX90A: bb.0 (%ir-block.0): - ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7 + ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9 ; PEI-GFX90A-NEXT: {{ $}} - ; PEI-GFX90A-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; PEI-GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - ; PEI-GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 + ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def 
$sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0 ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1 - ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) + ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) @@ -87,7 +94,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec ; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, 
implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) + ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll index 1fd311f225db4..51e0eb6d49794 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll @@ -153,7 +153,7 @@ define internal void @merge_cycle_1() #3 { define amdgpu_kernel void @kernel_64_256() #7 { ; CHECK-LABEL: define {{[^@]+}}@kernel_64_256 -; CHECK-SAME: () #[[ATTR6]] { +; CHECK-SAME: () #[[ATTR8:[0-9]+]] { ; CHECK-NEXT: call void @merge_cycle_0() ; CHECK-NEXT: call void @default_captured_address() ; CHECK-NEXT: call void @externally_visible_default() diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index 9577f2a932f96..e911df4aa4da3 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -210,7 +210,7 @@ define void @externally_visible_default() { ; 1,10 -> 3,8 define internal i32 @bitcasted_function() { ; CHECK-LABEL: define internal i32 @bitcasted_function -; CHECK-SAME: () #[[ATTR8]] { +; CHECK-SAME: () #[[ATTR10:[0-9]+]] { ; CHECK-NEXT: ret i32 0 ; ret i32 0 @@ 
-407,7 +407,7 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" } ; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll index 8792e60bb0ca1..667db7b1ebbd0 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll @@ -148,7 +148,7 @@ define amdgpu_kernel void @kernel_lds() { define internal i16 @mutual_recursion_0(i16 %arg) { ; CHECK-LABEL: define internal i16 @mutual_recursion_0( -; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR5:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[RECURSIVE_KERNEL_LDS:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[RECURSIVE_KERNEL_LDS]], align 4 @@ -168,7 +168,7 @@ define internal i16 @mutual_recursion_0(i16 %arg) { define internal void @mutual_recursion_1(i16 %arg) { ; CHECK-LABEL: define internal void @mutual_recursion_1( -; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR5]] { ; CHECK-NEXT: call void 
@mutual_recursion_0(i16 [[ARG]]) ; CHECK-NEXT: ret void ; @@ -178,7 +178,7 @@ define internal void @mutual_recursion_1(i16 %arg) { define amdgpu_kernel void @kernel_lds_recursion() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_lds_recursion( -; CHECK-SAME: ) #[[ATTR2]] !llvm.amdgcn.lds.kernel.id [[META9:![0-9]+]] { +; CHECK-SAME: ) #[[ATTR6:[0-9]+]] !llvm.amdgcn.lds.kernel.id !9 { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds) ] ; CHECK-NEXT: call void @mutual_recursion_0(i16 0) ; CHECK-NEXT: ret void @@ -196,6 +196,8 @@ define amdgpu_kernel void @kernel_lds_recursion() { ; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" 
"amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index c2132cf907fdb..847c2d343d415 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -6,6 +6,9 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -33,9 +36,12 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a ; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x5a +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_sad_u32 v2, s2, v0, 20 ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -57,6 +63,9 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -83,7 +92,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s16, 
s16, s15 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 @@ -92,6 +103,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -119,12 +131,15 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s16, s16, s15 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_sad_u32 v2, s0, v2, v3 ; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen @@ -151,7 +166,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s16, s16, s15 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s3, s0, s1 @@ -162,6 +179,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, 
s11 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -186,7 +204,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s16, s16, s15 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 @@ -197,6 +217,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -222,7 +243,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s16, s16, s15 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s1 @@ -233,6 +256,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i ; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -255,7 +279,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; 
GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_add_u32 s16, s16, s15 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 @@ -264,6 +290,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -285,6 +312,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat1: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 @@ -321,6 +351,9 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat2: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 @@ -358,6 +391,8 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 ; GCN-NEXT: s_load_dword s4, s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GCN-NEXT: 
s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NEXT: s_lshr_b32 s0, s0, 16 @@ -365,6 +400,7 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: v_sad_u32 v2, s4, v1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: flat_store_short v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -384,6 +420,9 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { ; GCN-LABEL: v_sad_u32_i16_pat2: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: flat_load_ushort v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -416,6 +455,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xff ; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 @@ -443,6 +485,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { ; GCN-LABEL: v_sad_u32_i8_pat2: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -475,6 +520,9 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext % 
; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xff ; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 @@ -502,6 +550,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s6, s0, s1 ; GCN-NEXT: s_cmp_le_u32 s0, s1 @@ -531,6 +582,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s3 ; GCN-NEXT: s_sub_i32 s6, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll index 89a09dc4fcc17..9826585df8bd8 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll @@ -9,6 +9,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -24,6 +26,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 
x i32> %in, ptr %out) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX906-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 ; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -39,6 +43,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX908-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -55,6 +61,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -88,6 +96,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -103,6 +113,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX906-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; 
GFX906-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 ; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -118,6 +130,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX908-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -134,6 +148,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 7f8240eeb98eb..5d0ddcc7114c2 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -24,175 +24,179 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_writelane_b32 v22, s2, 0 ; CHECK-NEXT: v_writelane_b32 v22, s3, 1 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:7] +; CHECK-NEXT: ; def s[48:51] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s4, 2 -; CHECK-NEXT: v_writelane_b32 v22, s5, 3 -; CHECK-NEXT: v_writelane_b32 v22, s6, 4 -; CHECK-NEXT: v_writelane_b32 v22, s7, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s4, 6 -; CHECK-NEXT: v_writelane_b32 v22, s5, 7 
-; CHECK-NEXT: v_writelane_b32 v22, s6, 8 -; CHECK-NEXT: v_writelane_b32 v22, s7, 9 -; CHECK-NEXT: v_writelane_b32 v22, s8, 10 -; CHECK-NEXT: v_writelane_b32 v22, s9, 11 -; CHECK-NEXT: v_writelane_b32 v22, s10, 12 -; CHECK-NEXT: v_writelane_b32 v22, s11, 13 +; CHECK-NEXT: v_writelane_b32 v23, s4, 2 +; CHECK-NEXT: v_writelane_b32 v23, s5, 3 +; CHECK-NEXT: v_writelane_b32 v23, s6, 4 +; CHECK-NEXT: v_writelane_b32 v23, s7, 5 +; CHECK-NEXT: v_writelane_b32 v23, s8, 6 +; CHECK-NEXT: v_writelane_b32 v23, s9, 7 +; CHECK-NEXT: v_writelane_b32 v23, s10, 8 +; CHECK-NEXT: v_writelane_b32 v23, s11, 9 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:19] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s4, 14 -; CHECK-NEXT: v_writelane_b32 v22, s5, 15 -; CHECK-NEXT: v_writelane_b32 v22, s6, 16 -; CHECK-NEXT: v_writelane_b32 v22, s7, 17 -; CHECK-NEXT: v_writelane_b32 v22, s8, 18 -; CHECK-NEXT: v_writelane_b32 v22, s9, 19 -; CHECK-NEXT: v_writelane_b32 v22, s10, 20 -; CHECK-NEXT: v_writelane_b32 v22, s11, 21 -; CHECK-NEXT: v_writelane_b32 v22, s12, 22 -; CHECK-NEXT: v_writelane_b32 v22, s13, 23 -; CHECK-NEXT: v_writelane_b32 v22, s14, 24 -; CHECK-NEXT: v_writelane_b32 v22, s15, 25 -; CHECK-NEXT: v_writelane_b32 v22, s16, 26 -; CHECK-NEXT: v_writelane_b32 v22, s17, 27 -; CHECK-NEXT: v_writelane_b32 v22, s18, 28 -; CHECK-NEXT: v_writelane_b32 v22, s19, 29 +; CHECK-NEXT: v_writelane_b32 v23, s4, 10 +; CHECK-NEXT: v_writelane_b32 v23, s5, 11 +; CHECK-NEXT: v_writelane_b32 v23, s6, 12 +; CHECK-NEXT: v_writelane_b32 v23, s7, 13 +; CHECK-NEXT: v_writelane_b32 v23, s8, 14 +; CHECK-NEXT: v_writelane_b32 v23, s9, 15 +; CHECK-NEXT: v_writelane_b32 v23, s10, 16 +; CHECK-NEXT: v_writelane_b32 v23, s11, 17 +; CHECK-NEXT: v_writelane_b32 v23, s12, 18 +; CHECK-NEXT: v_writelane_b32 v23, s13, 19 +; CHECK-NEXT: v_writelane_b32 v23, s14, 20 +; CHECK-NEXT: v_writelane_b32 v23, s15, 21 +; CHECK-NEXT: v_writelane_b32 v23, s16, 22 +; CHECK-NEXT: v_writelane_b32 v23, s17, 23 +; CHECK-NEXT: 
v_writelane_b32 v23, s18, 24 +; CHECK-NEXT: v_writelane_b32 v23, s19, 25 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[42:43] +; CHECK-NEXT: ; def s[38:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[52:55] +; CHECK-NEXT: ; def s[44:47] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s4, 30 -; CHECK-NEXT: v_writelane_b32 v22, s5, 31 -; CHECK-NEXT: v_writelane_b32 v22, s6, 32 -; CHECK-NEXT: v_writelane_b32 v22, s7, 33 -; CHECK-NEXT: v_writelane_b32 v22, s8, 34 -; CHECK-NEXT: v_writelane_b32 v22, s9, 35 -; CHECK-NEXT: v_writelane_b32 v22, s10, 36 -; CHECK-NEXT: v_writelane_b32 v22, s11, 37 +; CHECK-NEXT: v_writelane_b32 v23, s4, 26 +; CHECK-NEXT: v_writelane_b32 v23, s5, 27 +; CHECK-NEXT: v_writelane_b32 v23, s6, 28 +; CHECK-NEXT: v_writelane_b32 v23, s7, 29 +; CHECK-NEXT: v_writelane_b32 v23, s8, 30 +; CHECK-NEXT: v_writelane_b32 v23, s9, 31 +; CHECK-NEXT: v_writelane_b32 v23, s10, 32 +; CHECK-NEXT: v_writelane_b32 v23, s11, 33 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[40:41] +; CHECK-NEXT: ; def s[36:37] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[36:39] +; CHECK-NEXT: ; def s[40:43] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[44:51] +; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 34 +; CHECK-NEXT: v_writelane_b32 v23, s1, 35 +; CHECK-NEXT: v_writelane_b32 v23, s2, 36 +; CHECK-NEXT: v_writelane_b32 v23, s3, 37 +; CHECK-NEXT: v_writelane_b32 v23, s4, 38 +; CHECK-NEXT: v_writelane_b32 v23, s5, 39 +; CHECK-NEXT: v_writelane_b32 v23, s6, 40 +; CHECK-NEXT: v_writelane_b32 v23, s7, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s0, 
38 -; CHECK-NEXT: v_writelane_b32 v22, s1, 39 -; CHECK-NEXT: v_writelane_b32 v22, s2, 40 -; CHECK-NEXT: v_writelane_b32 v22, s3, 41 -; CHECK-NEXT: v_writelane_b32 v22, s4, 42 -; CHECK-NEXT: v_writelane_b32 v22, s5, 43 -; CHECK-NEXT: v_writelane_b32 v22, s6, 44 -; CHECK-NEXT: v_writelane_b32 v22, s7, 45 -; CHECK-NEXT: v_writelane_b32 v22, s8, 46 -; CHECK-NEXT: v_writelane_b32 v22, s9, 47 -; CHECK-NEXT: v_writelane_b32 v22, s10, 48 -; CHECK-NEXT: v_writelane_b32 v22, s11, 49 -; CHECK-NEXT: v_writelane_b32 v22, s12, 50 -; CHECK-NEXT: v_writelane_b32 v22, s13, 51 -; CHECK-NEXT: v_writelane_b32 v22, s14, 52 -; CHECK-NEXT: v_writelane_b32 v22, s15, 53 +; CHECK-NEXT: v_writelane_b32 v23, s0, 42 +; CHECK-NEXT: v_writelane_b32 v23, s1, 43 +; CHECK-NEXT: v_writelane_b32 v23, s2, 44 +; CHECK-NEXT: v_writelane_b32 v23, s3, 45 +; CHECK-NEXT: v_writelane_b32 v23, s4, 46 +; CHECK-NEXT: v_writelane_b32 v23, s5, 47 +; CHECK-NEXT: v_writelane_b32 v23, s6, 48 +; CHECK-NEXT: v_writelane_b32 v23, s7, 49 +; CHECK-NEXT: v_writelane_b32 v23, s8, 50 +; CHECK-NEXT: v_writelane_b32 v23, s9, 51 +; CHECK-NEXT: v_writelane_b32 v23, s10, 52 +; CHECK-NEXT: v_writelane_b32 v23, s11, 53 +; CHECK-NEXT: v_writelane_b32 v23, s12, 54 +; CHECK-NEXT: v_writelane_b32 v23, s13, 55 +; CHECK-NEXT: v_writelane_b32 v23, s14, 56 +; CHECK-NEXT: v_writelane_b32 v23, s15, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s0, 54 -; CHECK-NEXT: v_writelane_b32 v22, s1, 55 -; CHECK-NEXT: v_writelane_b32 v22, s2, 56 -; CHECK-NEXT: v_writelane_b32 v22, s3, 57 +; CHECK-NEXT: v_writelane_b32 v23, s0, 58 +; CHECK-NEXT: v_writelane_b32 v23, s1, 59 +; CHECK-NEXT: v_writelane_b32 v23, s2, 60 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v23, s3, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, 
s0, 58 -; CHECK-NEXT: v_writelane_b32 v22, s1, 59 -; CHECK-NEXT: v_writelane_b32 v22, s2, 60 -; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane -; CHECK-NEXT: v_writelane_b32 v22, s3, 61 -; CHECK-NEXT: v_writelane_b32 v22, s4, 62 -; CHECK-NEXT: v_writelane_b32 v23, s6, 0 -; CHECK-NEXT: v_writelane_b32 v22, s5, 63 -; CHECK-NEXT: v_writelane_b32 v23, s7, 1 +; CHECK-NEXT: v_writelane_b32 v23, s0, 62 +; CHECK-NEXT: v_writelane_b32 v0, s2, 0 +; CHECK-NEXT: v_writelane_b32 v0, s3, 1 +; CHECK-NEXT: v_writelane_b32 v0, s4, 2 +; CHECK-NEXT: v_writelane_b32 v0, s5, 3 +; CHECK-NEXT: v_writelane_b32 v0, s6, 4 +; CHECK-NEXT: v_writelane_b32 v23, s1, 63 +; CHECK-NEXT: v_writelane_b32 v0, s7, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 2 -; CHECK-NEXT: v_writelane_b32 v23, s1, 3 -; CHECK-NEXT: v_writelane_b32 v23, s2, 4 -; CHECK-NEXT: v_writelane_b32 v23, s3, 5 -; CHECK-NEXT: v_writelane_b32 v23, s4, 6 -; CHECK-NEXT: v_writelane_b32 v23, s5, 7 -; CHECK-NEXT: v_writelane_b32 v23, s6, 8 -; CHECK-NEXT: v_writelane_b32 v23, s7, 9 -; CHECK-NEXT: v_writelane_b32 v23, s8, 10 -; CHECK-NEXT: v_writelane_b32 v23, s9, 11 -; CHECK-NEXT: v_writelane_b32 v23, s10, 12 -; CHECK-NEXT: v_writelane_b32 v23, s11, 13 -; CHECK-NEXT: v_writelane_b32 v23, s12, 14 -; CHECK-NEXT: v_writelane_b32 v23, s13, 15 -; CHECK-NEXT: v_writelane_b32 v23, s14, 16 -; CHECK-NEXT: v_writelane_b32 v23, s15, 17 +; CHECK-NEXT: v_writelane_b32 v0, s0, 6 +; CHECK-NEXT: v_writelane_b32 v0, s1, 7 +; CHECK-NEXT: v_writelane_b32 v0, s2, 8 +; CHECK-NEXT: v_writelane_b32 v0, s3, 9 +; CHECK-NEXT: v_writelane_b32 v0, s4, 10 +; CHECK-NEXT: v_writelane_b32 v0, s5, 11 +; CHECK-NEXT: v_writelane_b32 v0, s6, 12 +; CHECK-NEXT: v_writelane_b32 v0, s7, 13 +; CHECK-NEXT: v_writelane_b32 v0, s8, 14 +; CHECK-NEXT: v_writelane_b32 v0, s9, 15 +; CHECK-NEXT: v_writelane_b32 v0, s10, 16 +; CHECK-NEXT: v_writelane_b32 v0, s11, 17 +; CHECK-NEXT: v_writelane_b32 
v0, s12, 18 +; CHECK-NEXT: v_writelane_b32 v0, s13, 19 +; CHECK-NEXT: v_writelane_b32 v0, s14, 20 +; CHECK-NEXT: v_writelane_b32 v0, s15, 21 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 18 -; CHECK-NEXT: v_writelane_b32 v23, s1, 19 +; CHECK-NEXT: v_writelane_b32 v0, s0, 22 +; CHECK-NEXT: v_writelane_b32 v0, s1, 23 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 20 -; CHECK-NEXT: v_writelane_b32 v23, s1, 21 -; CHECK-NEXT: v_writelane_b32 v23, s2, 22 -; CHECK-NEXT: v_writelane_b32 v23, s3, 23 +; CHECK-NEXT: v_writelane_b32 v0, s0, 24 +; CHECK-NEXT: v_writelane_b32 v0, s1, 25 +; CHECK-NEXT: v_writelane_b32 v0, s2, 26 +; CHECK-NEXT: v_writelane_b32 v0, s3, 27 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 24 -; CHECK-NEXT: v_writelane_b32 v23, s1, 25 -; CHECK-NEXT: v_writelane_b32 v23, s2, 26 -; CHECK-NEXT: v_writelane_b32 v23, s3, 27 -; CHECK-NEXT: v_writelane_b32 v23, s4, 28 -; CHECK-NEXT: v_writelane_b32 v23, s5, 29 -; CHECK-NEXT: v_writelane_b32 v23, s6, 30 -; CHECK-NEXT: v_writelane_b32 v23, s7, 31 +; CHECK-NEXT: v_writelane_b32 v0, s0, 28 +; CHECK-NEXT: v_writelane_b32 v0, s1, 29 +; CHECK-NEXT: v_writelane_b32 v0, s2, 30 +; CHECK-NEXT: v_writelane_b32 v0, s3, 31 +; CHECK-NEXT: v_writelane_b32 v0, s4, 32 +; CHECK-NEXT: v_writelane_b32 v0, s5, 33 +; CHECK-NEXT: v_writelane_b32 v0, s6, 34 +; CHECK-NEXT: v_writelane_b32 v0, s7, 35 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 32 -; CHECK-NEXT: v_writelane_b32 v23, s1, 33 -; CHECK-NEXT: v_writelane_b32 v23, s2, 34 -; CHECK-NEXT: v_writelane_b32 v23, s3, 35 -; CHECK-NEXT: v_writelane_b32 v23, s4, 36 -; CHECK-NEXT: v_writelane_b32 v23, s5, 37 -; CHECK-NEXT: v_writelane_b32 v23, s6, 38 -; CHECK-NEXT: v_writelane_b32 v23, s7, 39 -; CHECK-NEXT: 
v_writelane_b32 v23, s8, 40 -; CHECK-NEXT: v_writelane_b32 v23, s9, 41 -; CHECK-NEXT: v_writelane_b32 v23, s10, 42 -; CHECK-NEXT: v_writelane_b32 v23, s11, 43 -; CHECK-NEXT: v_writelane_b32 v23, s12, 44 -; CHECK-NEXT: v_writelane_b32 v23, s13, 45 -; CHECK-NEXT: v_writelane_b32 v23, s14, 46 -; CHECK-NEXT: v_writelane_b32 v23, s15, 47 +; CHECK-NEXT: v_writelane_b32 v0, s0, 36 +; CHECK-NEXT: v_writelane_b32 v0, s1, 37 +; CHECK-NEXT: v_writelane_b32 v0, s2, 38 +; CHECK-NEXT: v_writelane_b32 v0, s3, 39 +; CHECK-NEXT: v_writelane_b32 v0, s4, 40 +; CHECK-NEXT: v_writelane_b32 v0, s5, 41 +; CHECK-NEXT: v_writelane_b32 v0, s6, 42 +; CHECK-NEXT: v_writelane_b32 v0, s7, 43 +; CHECK-NEXT: v_writelane_b32 v0, s8, 44 +; CHECK-NEXT: v_writelane_b32 v0, s9, 45 +; CHECK-NEXT: v_writelane_b32 v0, s10, 46 +; CHECK-NEXT: v_writelane_b32 v0, s11, 47 +; CHECK-NEXT: v_writelane_b32 v0, s12, 48 +; CHECK-NEXT: v_writelane_b32 v0, s13, 49 +; CHECK-NEXT: v_writelane_b32 v0, s14, 50 +; CHECK-NEXT: v_writelane_b32 v0, s15, 51 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %ret ; CHECK-NEXT: s_endpgm @@ -202,170 +206,174 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 2 -; CHECK-NEXT: v_readlane_b32 s1, v22, 3 -; CHECK-NEXT: v_readlane_b32 s2, v22, 4 -; CHECK-NEXT: v_readlane_b32 s3, v22, 5 +; CHECK-NEXT: v_readlane_b32 s0, v23, 2 +; CHECK-NEXT: v_readlane_b32 s1, v23, 3 +; CHECK-NEXT: v_readlane_b32 s2, v23, 4 +; CHECK-NEXT: v_readlane_b32 s3, v23, 5 +; CHECK-NEXT: v_readlane_b32 s4, v23, 6 +; CHECK-NEXT: v_readlane_b32 s5, v23, 7 +; CHECK-NEXT: v_readlane_b32 s6, v23, 8 +; CHECK-NEXT: v_readlane_b32 s7, v23, 9 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ; use s[48:51] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 6 -; CHECK-NEXT: v_readlane_b32 s1, v22, 7 -; CHECK-NEXT: v_readlane_b32 s2, v22, 8 
-; CHECK-NEXT: v_readlane_b32 s3, v22, 9 -; CHECK-NEXT: v_readlane_b32 s4, v22, 10 -; CHECK-NEXT: v_readlane_b32 s5, v22, 11 -; CHECK-NEXT: v_readlane_b32 s6, v22, 12 -; CHECK-NEXT: v_readlane_b32 s7, v22, 13 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 14 -; CHECK-NEXT: v_readlane_b32 s1, v22, 15 -; CHECK-NEXT: v_readlane_b32 s2, v22, 16 -; CHECK-NEXT: v_readlane_b32 s3, v22, 17 -; CHECK-NEXT: v_readlane_b32 s4, v22, 18 -; CHECK-NEXT: v_readlane_b32 s5, v22, 19 -; CHECK-NEXT: v_readlane_b32 s6, v22, 20 -; CHECK-NEXT: v_readlane_b32 s7, v22, 21 -; CHECK-NEXT: v_readlane_b32 s8, v22, 22 -; CHECK-NEXT: v_readlane_b32 s9, v22, 23 -; CHECK-NEXT: v_readlane_b32 s10, v22, 24 -; CHECK-NEXT: v_readlane_b32 s11, v22, 25 -; CHECK-NEXT: v_readlane_b32 s12, v22, 26 -; CHECK-NEXT: v_readlane_b32 s13, v22, 27 -; CHECK-NEXT: v_readlane_b32 s14, v22, 28 -; CHECK-NEXT: v_readlane_b32 s15, v22, 29 +; CHECK-NEXT: v_readlane_b32 s0, v23, 10 +; CHECK-NEXT: v_readlane_b32 s1, v23, 11 +; CHECK-NEXT: v_readlane_b32 s2, v23, 12 +; CHECK-NEXT: v_readlane_b32 s3, v23, 13 +; CHECK-NEXT: v_readlane_b32 s4, v23, 14 +; CHECK-NEXT: v_readlane_b32 s5, v23, 15 +; CHECK-NEXT: v_readlane_b32 s6, v23, 16 +; CHECK-NEXT: v_readlane_b32 s7, v23, 17 +; CHECK-NEXT: v_readlane_b32 s8, v23, 18 +; CHECK-NEXT: v_readlane_b32 s9, v23, 19 +; CHECK-NEXT: v_readlane_b32 s10, v23, 20 +; CHECK-NEXT: v_readlane_b32 s11, v23, 21 +; CHECK-NEXT: v_readlane_b32 s12, v23, 22 +; CHECK-NEXT: v_readlane_b32 s13, v23, 23 +; CHECK-NEXT: v_readlane_b32 s14, v23, 24 +; CHECK-NEXT: v_readlane_b32 s15, v23, 25 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 30 -; CHECK-NEXT: v_readlane_b32 s1, v22, 31 -; CHECK-NEXT: v_readlane_b32 s2, v22, 32 -; CHECK-NEXT: v_readlane_b32 s3, v22, 33 -; CHECK-NEXT: v_readlane_b32 s4, v22, 34 -; CHECK-NEXT: v_readlane_b32 s5, v22, 35 -; CHECK-NEXT: 
v_readlane_b32 s6, v22, 36 -; CHECK-NEXT: v_readlane_b32 s7, v22, 37 +; CHECK-NEXT: v_readlane_b32 s0, v23, 26 +; CHECK-NEXT: v_readlane_b32 s1, v23, 27 +; CHECK-NEXT: v_readlane_b32 s2, v23, 28 +; CHECK-NEXT: v_readlane_b32 s3, v23, 29 +; CHECK-NEXT: v_readlane_b32 s4, v23, 30 +; CHECK-NEXT: v_readlane_b32 s5, v23, 31 +; CHECK-NEXT: v_readlane_b32 s6, v23, 32 +; CHECK-NEXT: v_readlane_b32 s7, v23, 33 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[42:43] +; CHECK-NEXT: ; use s[38:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[52:55] +; CHECK-NEXT: ; use s[44:47] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 38 -; CHECK-NEXT: v_readlane_b32 s1, v22, 39 -; CHECK-NEXT: v_readlane_b32 s2, v22, 40 -; CHECK-NEXT: v_readlane_b32 s3, v22, 41 +; CHECK-NEXT: v_readlane_b32 s0, v23, 34 +; CHECK-NEXT: v_readlane_b32 s1, v23, 35 +; CHECK-NEXT: v_readlane_b32 s2, v23, 36 +; CHECK-NEXT: v_readlane_b32 s3, v23, 37 +; CHECK-NEXT: v_readlane_b32 s4, v23, 38 +; CHECK-NEXT: v_readlane_b32 s5, v23, 39 +; CHECK-NEXT: v_readlane_b32 s6, v23, 40 +; CHECK-NEXT: v_readlane_b32 s7, v23, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[40:41] +; CHECK-NEXT: ; use s[36:37] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[36:39] +; CHECK-NEXT: ; use s[40:43] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[44:51] +; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s4, v22, 42 -; CHECK-NEXT: v_readlane_b32 s5, v22, 43 -; CHECK-NEXT: v_readlane_b32 s6, v22, 44 -; CHECK-NEXT: v_readlane_b32 s7, v22, 45 -; CHECK-NEXT: v_readlane_b32 s8, v22, 46 -; CHECK-NEXT: v_readlane_b32 s9, v22, 47 -; CHECK-NEXT: v_readlane_b32 s10, v22, 48 -; CHECK-NEXT: v_readlane_b32 s11, v22, 49 -; CHECK-NEXT: v_readlane_b32 s12, v22, 50 -; 
CHECK-NEXT: v_readlane_b32 s13, v22, 51 -; CHECK-NEXT: v_readlane_b32 s14, v22, 52 -; CHECK-NEXT: v_readlane_b32 s15, v22, 53 +; CHECK-NEXT: v_readlane_b32 s0, v23, 42 +; CHECK-NEXT: v_readlane_b32 s1, v23, 43 +; CHECK-NEXT: v_readlane_b32 s2, v23, 44 +; CHECK-NEXT: v_readlane_b32 s3, v23, 45 +; CHECK-NEXT: v_readlane_b32 s4, v23, 46 +; CHECK-NEXT: v_readlane_b32 s5, v23, 47 +; CHECK-NEXT: v_readlane_b32 s6, v23, 48 +; CHECK-NEXT: v_readlane_b32 s7, v23, 49 +; CHECK-NEXT: v_readlane_b32 s8, v23, 50 +; CHECK-NEXT: v_readlane_b32 s9, v23, 51 +; CHECK-NEXT: v_readlane_b32 s10, v23, 52 +; CHECK-NEXT: v_readlane_b32 s11, v23, 53 +; CHECK-NEXT: v_readlane_b32 s12, v23, 54 +; CHECK-NEXT: v_readlane_b32 s13, v23, 55 +; CHECK-NEXT: v_readlane_b32 s14, v23, 56 +; CHECK-NEXT: v_readlane_b32 s15, v23, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 54 -; CHECK-NEXT: v_readlane_b32 s1, v22, 55 -; CHECK-NEXT: v_readlane_b32 s2, v22, 56 -; CHECK-NEXT: v_readlane_b32 s3, v22, 57 +; CHECK-NEXT: v_readlane_b32 s0, v23, 58 +; CHECK-NEXT: v_readlane_b32 s1, v23, 59 +; CHECK-NEXT: v_readlane_b32 s2, v23, 60 +; CHECK-NEXT: v_readlane_b32 s3, v23, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 58 -; CHECK-NEXT: v_readlane_b32 s1, v22, 59 -; CHECK-NEXT: v_readlane_b32 s2, v22, 60 -; CHECK-NEXT: v_readlane_b32 s3, v22, 61 -; CHECK-NEXT: v_readlane_b32 s4, v22, 62 -; CHECK-NEXT: v_readlane_b32 s5, v22, 63 -; CHECK-NEXT: v_readlane_b32 s6, v23, 0 -; CHECK-NEXT: v_readlane_b32 s7, v23, 1 +; CHECK-NEXT: v_readlane_b32 s0, v23, 62 +; CHECK-NEXT: v_readlane_b32 s1, v23, 63 +; CHECK-NEXT: v_readlane_b32 s2, v0, 0 +; CHECK-NEXT: v_readlane_b32 s3, v0, 1 +; CHECK-NEXT: v_readlane_b32 s4, v0, 2 +; CHECK-NEXT: v_readlane_b32 s5, v0, 3 +; CHECK-NEXT: v_readlane_b32 s6, v0, 4 +; 
CHECK-NEXT: v_readlane_b32 s7, v0, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 2 -; CHECK-NEXT: v_readlane_b32 s1, v23, 3 -; CHECK-NEXT: v_readlane_b32 s2, v23, 4 -; CHECK-NEXT: v_readlane_b32 s3, v23, 5 -; CHECK-NEXT: v_readlane_b32 s4, v23, 6 -; CHECK-NEXT: v_readlane_b32 s5, v23, 7 -; CHECK-NEXT: v_readlane_b32 s6, v23, 8 -; CHECK-NEXT: v_readlane_b32 s7, v23, 9 -; CHECK-NEXT: v_readlane_b32 s8, v23, 10 -; CHECK-NEXT: v_readlane_b32 s9, v23, 11 -; CHECK-NEXT: v_readlane_b32 s10, v23, 12 -; CHECK-NEXT: v_readlane_b32 s11, v23, 13 -; CHECK-NEXT: v_readlane_b32 s12, v23, 14 -; CHECK-NEXT: v_readlane_b32 s13, v23, 15 -; CHECK-NEXT: v_readlane_b32 s14, v23, 16 -; CHECK-NEXT: v_readlane_b32 s15, v23, 17 +; CHECK-NEXT: v_readlane_b32 s0, v0, 6 +; CHECK-NEXT: v_readlane_b32 s1, v0, 7 +; CHECK-NEXT: v_readlane_b32 s2, v0, 8 +; CHECK-NEXT: v_readlane_b32 s3, v0, 9 +; CHECK-NEXT: v_readlane_b32 s4, v0, 10 +; CHECK-NEXT: v_readlane_b32 s5, v0, 11 +; CHECK-NEXT: v_readlane_b32 s6, v0, 12 +; CHECK-NEXT: v_readlane_b32 s7, v0, 13 +; CHECK-NEXT: v_readlane_b32 s8, v0, 14 +; CHECK-NEXT: v_readlane_b32 s9, v0, 15 +; CHECK-NEXT: v_readlane_b32 s10, v0, 16 +; CHECK-NEXT: v_readlane_b32 s11, v0, 17 +; CHECK-NEXT: v_readlane_b32 s12, v0, 18 +; CHECK-NEXT: v_readlane_b32 s13, v0, 19 +; CHECK-NEXT: v_readlane_b32 s14, v0, 20 +; CHECK-NEXT: v_readlane_b32 s15, v0, 21 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 18 -; CHECK-NEXT: v_readlane_b32 s1, v23, 19 +; CHECK-NEXT: v_readlane_b32 s0, v0, 22 +; CHECK-NEXT: v_readlane_b32 s1, v0, 23 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 20 -; CHECK-NEXT: v_readlane_b32 s1, v23, 21 -; CHECK-NEXT: v_readlane_b32 s2, v23, 22 -; CHECK-NEXT: v_readlane_b32 s3, v23, 23 +; CHECK-NEXT: v_readlane_b32 s0, v0, 24 +; CHECK-NEXT: 
v_readlane_b32 s1, v0, 25 +; CHECK-NEXT: v_readlane_b32 s2, v0, 26 +; CHECK-NEXT: v_readlane_b32 s3, v0, 27 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 24 -; CHECK-NEXT: v_readlane_b32 s1, v23, 25 -; CHECK-NEXT: v_readlane_b32 s2, v23, 26 -; CHECK-NEXT: v_readlane_b32 s3, v23, 27 -; CHECK-NEXT: v_readlane_b32 s4, v23, 28 -; CHECK-NEXT: v_readlane_b32 s5, v23, 29 -; CHECK-NEXT: v_readlane_b32 s6, v23, 30 -; CHECK-NEXT: v_readlane_b32 s7, v23, 31 +; CHECK-NEXT: v_readlane_b32 s0, v0, 28 +; CHECK-NEXT: v_readlane_b32 s1, v0, 29 +; CHECK-NEXT: v_readlane_b32 s2, v0, 30 +; CHECK-NEXT: v_readlane_b32 s3, v0, 31 +; CHECK-NEXT: v_readlane_b32 s4, v0, 32 +; CHECK-NEXT: v_readlane_b32 s5, v0, 33 +; CHECK-NEXT: v_readlane_b32 s6, v0, 34 +; CHECK-NEXT: v_readlane_b32 s7, v0, 35 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 32 -; CHECK-NEXT: v_readlane_b32 s1, v23, 33 -; CHECK-NEXT: v_readlane_b32 s2, v23, 34 -; CHECK-NEXT: v_readlane_b32 s3, v23, 35 -; CHECK-NEXT: v_readlane_b32 s4, v23, 36 -; CHECK-NEXT: v_readlane_b32 s5, v23, 37 -; CHECK-NEXT: v_readlane_b32 s6, v23, 38 -; CHECK-NEXT: v_readlane_b32 s7, v23, 39 -; CHECK-NEXT: v_readlane_b32 s8, v23, 40 -; CHECK-NEXT: v_readlane_b32 s9, v23, 41 -; CHECK-NEXT: v_readlane_b32 s10, v23, 42 -; CHECK-NEXT: v_readlane_b32 s11, v23, 43 -; CHECK-NEXT: v_readlane_b32 s12, v23, 44 -; CHECK-NEXT: v_readlane_b32 s13, v23, 45 -; CHECK-NEXT: v_readlane_b32 s14, v23, 46 -; CHECK-NEXT: v_readlane_b32 s15, v23, 47 +; CHECK-NEXT: v_readlane_b32 s0, v0, 36 +; CHECK-NEXT: v_readlane_b32 s1, v0, 37 +; CHECK-NEXT: v_readlane_b32 s2, v0, 38 +; CHECK-NEXT: v_readlane_b32 s3, v0, 39 +; CHECK-NEXT: v_readlane_b32 s4, v0, 40 +; CHECK-NEXT: v_readlane_b32 s5, v0, 41 +; CHECK-NEXT: v_readlane_b32 s6, v0, 42 +; CHECK-NEXT: v_readlane_b32 s7, v0, 43 +; CHECK-NEXT: v_readlane_b32 s8, v0, 44 +; CHECK-NEXT: 
v_readlane_b32 s9, v0, 45 +; CHECK-NEXT: v_readlane_b32 s10, v0, 46 +; CHECK-NEXT: v_readlane_b32 s11, v0, 47 +; CHECK-NEXT: v_readlane_b32 s12, v0, 48 +; CHECK-NEXT: v_readlane_b32 s13, v0, 49 +; CHECK-NEXT: v_readlane_b32 s14, v0, 50 +; CHECK-NEXT: v_readlane_b32 s15, v0, 51 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index ebc916b5c889b..f69b7ae105124 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -182,8 +182,10 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_shl_i128_ss: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s5, s4, 64 ; GCN-NEXT: s_sub_i32 s12, 64, s4 @@ -203,6 +205,7 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -215,8 +218,10 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_lshr_i128_ss: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s5, s4, 64 ; GCN-NEXT: s_sub_i32 s12, 64, s4 @@ -236,6 +241,7 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; 
GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -248,8 +254,10 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_ashr_i128_ss: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s5, 64, s4 ; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 @@ -270,6 +278,7 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -430,6 +439,9 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 @@ -502,6 +514,9 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 @@ -574,6 +589,9 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { define 
amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll index 3902c6dd422fd..8562cdf195b02 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll @@ -100,14 +100,8 @@ entry: } ;. -; NO: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; NO: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -;. 
-; OW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } -;. -; CW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. ; NO: [[META0]] = !{ptr @bar1, ptr @bar2} ;. diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index b872112922204..a38f0a6d86b8c 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -9,6 +9,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -20,6 +23,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -38,11 +44,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 
v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -52,11 +61,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -72,6 +84,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitcmp1_b32 s2, 0 ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -86,6 +101,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -104,6 +122,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; CI-LABEL: s_sint_to_fp_i64_to_f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; CI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -117,6 +138,9 
@@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; VI-LABEL: s_sint_to_fp_i64_to_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -136,6 +160,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 @@ -155,6 +182,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -183,6 +213,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 @@ -195,6 +228,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i32 s2, s2, 0x80000 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -232,11 +268,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -246,11 +285,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -283,11 +325,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: 
v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -297,11 +342,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -353,11 +401,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_add_i32 s10, s10, s15 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0, 0xbff00000 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -367,11 +418,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0, 0xbff00000 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git 
a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index 882356d994fc6..0a6009c0e7da8 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -12,10 +12,12 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %12.sub0 + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %31.sub0 + ; GCN-NEXT: SI_SPILL_V64_SAVE %31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %22:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %12 + ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %23:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]] ; GCN-NEXT: S_ENDPGM 0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 3644bef9c20a1..b2e334c66ccd2 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -50,7 +50,10 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 { define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_add_i32 s10, s10, s15 ; HAWAII-NEXT: s_or_b32 s0, s6, 14 +; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s11 +; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s7 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] @@ -70,7 +73,10 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: +; FIJI-NEXT: s_add_i32 s10, s10, s15 ; FIJI-NEXT: s_or_b32 s0, s6, 14 +; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s7 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll index 19d633651fdd0..30accc846d2b6 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 { ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ -; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @............... +; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!....... 
; ELF: AMDGPU Metadata ; ELF: .sgpr_count: 9 diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll index 2097579e0c995..4f84b31f1877b 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 { ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ -; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000 ................ +; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!....... ; ELF: AMDGPU Metadata ; ELF: .sgpr_count: 5 diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll index 775c62e73261a..0b1bd11b88d5d 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack < %s | FileCheck --check-prefixes=ASM %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s @@ -6,15 +7,17 @@ define amdgpu_kernel void @kern() #0 { ; ASM-LABEL: kern: -; ASM: .amdhsa_next_free_sgpr 5 -; ASM: .amdhsa_reserve_xnack_mask 1 +; ASM: ; %bb.0: ; %entry +; ASM-NEXT: ;;#ASMSTART +; ASM-NEXT: ;;#ASMEND +; ASM-NEXT: s_endpgm ; Verify that an extra SGPR block is reserved with XNACK "on" tid setting. ; OBJ: Contents of section .rodata: ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ 
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ -; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @............... +; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!....... ; ELF: AMDGPU Metadata ; ELF: .sgpr_count: 9 diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 4dfd4c095c87a..2516177691ce3 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -23,11 +23,14 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; HSA-TRAP-GFX803-LABEL: trap: ; HSA-TRAP-GFX803: ; %bb.0: ; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 +; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s2 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s3 +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) ; HSA-TRAP-GFX803-NEXT: s_trap 2 @@ -121,6 +124,9 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX803-LABEL: non_entry_trap: ; HSA-TRAP-GFX803: ; %bb.0: ; %entry ; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 +; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -282,6 +288,9 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; HSA-TRAP-GFX803: ; 
%bb.0: ; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 +; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5 @@ -414,10 +423,13 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; HSA-TRAP-GFX803-LABEL: debugtrap: ; HSA-TRAP-GFX803: ; %bb.0: ; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 +; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 ; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 735956caa72da..317e350f3eafe 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -81,6 +81,9 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-LABEL: udiv_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -252,6 +255,9 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GCN-LABEL: s_udiv_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, 
s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_sub_i32 s4, 0, s3 @@ -457,6 +463,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: udiv_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -810,6 +819,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: udiv_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 16 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1135,6 +1147,9 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; GCN-LABEL: udiv_i32_div_pow2: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1224,6 +1239,9 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; GCN-LABEL: udiv_i32_div_k_even: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1318,6 +1336,9 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; 
GCN-LABEL: udiv_i32_div_k_odd: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1430,6 +1451,9 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-LABEL: v_udiv_i8: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1570,6 +1594,9 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: v_udiv_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1726,6 +1753,9 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: v_udiv_i23: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1923,6 +1953,9 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: v_udiv_i24: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ 
-2105,6 +2138,9 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; GCN-LABEL: scalarize_mulhu_4xi32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -2218,6 +2254,9 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; GCN-LABEL: test_udiv2: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -2281,6 +2320,9 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0 @@ -2371,6 +2413,9 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; ; GCN-LABEL: fdiv_test_denormals: ; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: flat_load_sbyte v2, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index d00ea6dff2447..44b16d7f65dc5 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -9,6 +9,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: s_add_i32 s10, s10, s15 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 @@ -28,6 +31,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -54,6 +60,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; SI-LABEL: s_uint_to_fp_i64_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_add_i32 s10, s10, s15 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -67,6 +76,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; VI-LABEL: s_uint_to_fp_i64_to_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -86,6 +98,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 ; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_add_i32 s10, s10, s15 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: 
v_cvt_f64_u32_e32 v[2:3], s1 @@ -103,6 +118,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 ; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 @@ -126,6 +144,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) { ; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64: ; SI: ; %bb.0: +; SI-NEXT: s_add_i32 s10, s10, s15 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -158,6 +179,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; ; VI-LABEL: s_uint_to_fp_v4i64_to_v4f64: ; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -196,6 +220,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_add_i32 s10, s10, s15 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; SI-NEXT: v_mov_b32_e32 v3, s1 @@ -207,6 +234,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -222,6 +252,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 ; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_add_i32 s10, s10, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -239,6 +272,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 ; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_add_i32 s10, s10, s15 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -259,6 +295,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -286,11 +325,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s10, s10, s15 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: 
s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -300,11 +342,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -320,6 +365,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_add_i32 s10, s10, s15 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s2, 0 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -334,6 +382,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -353,6 +404,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_add_i32 s10, s10, s15 +; SI-NEXT: s_mov_b32 
flat_scratch_lo, s11 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xff ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -365,6 +419,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -402,11 +459,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s10, s10, s15 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -416,11 +476,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -453,11 +516,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; SI: ; 
%bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s10, s10, s15 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -467,11 +533,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -505,11 +574,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s10, s10, s15 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -519,11 +591,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s10, s10, s15 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll index 4545c8bbeb3e6..33f629a3c4f0c 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll @@ -25,8 +25,9 @@ ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' } -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } +; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr8' } +; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' } ; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 @@ -40,7 +41,7 @@ ; CHECK-NEXT: BitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 8 ; CHECK-NEXT: vgprForAGPRCopy: '' -; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: body: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index 8215ba834170f..1fbd3760eed26 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ 
b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -25,8 +25,9 @@ ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' } -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } +; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr8' } +; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' } ; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 @@ -40,7 +41,7 @@ ; CHECK-NEXT: BitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 8 ; CHECK-NEXT: vgprForAGPRCopy: '' -; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: body: From fefedb95360dcab3040237c0a4aae0229d59cc6b Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Sat, 17 Aug 2024 16:40:41 -0700 Subject: [PATCH 03/13] Previous code was incorrect for indirect calls of known callees. --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 17 +- .../AMDGPU/amdgpu-attributor-no-agpr.ll | 3 +- .../AMDGPU/attributor-flatscratchinit.ll | 359 ++++++++++-------- .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 2 +- .../AMDGPU/remove-no-kernel-id-attribute.ll | 6 +- 5 files changed, 205 insertions(+), 182 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 3ef0694046553..03391a36c54c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -697,20 +697,23 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is // not to be set. 
bool needFlatScratchInit(Attributor &A) { + assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set + // This is called on each callee; false means callee shouldn't have // no-flat-scratch-init. auto CheckForNoFlatScratchInit = [&](Instruction &I) { const auto &CB = cast(I); const Function *Callee = CB.getCalledFunction(); - if (!Callee) // indirect call - return CB.isInlineAsm(); - if (Callee->isIntrinsic()) - return Callee->getIntrinsicID() != Intrinsic::amdgcn_addrspacecast_nonnull; + if (Callee && Callee->isIntrinsic()) + return Callee->getIntrinsicID() != + Intrinsic::amdgcn_addrspacecast_nonnull; - const auto *CalleeInfo = A.getAAFor( - *this, IRPosition::function(*Callee), DepClassTy::REQUIRED); - return CalleeInfo && CalleeInfo->isAssumed(FLAT_SCRATCH_INIT); + // Return true for all other cases, including (1)inline asm, (2)direct + // call, and (3)indirect call with known callees. For (2) and (3) + // updateImpl() already checked the callees and we know their + // FLAT_SCRATCH_INIT bit is set. 
+ return true; }; bool UsedAssumedInformation = false; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll index 05ee8cabb5e7c..0e4a9791b6f57 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll @@ -229,7 +229,7 @@ define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) { define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) { ; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr( -; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR7:[0-9]+]] { +; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @also_empty ; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]] @@ -261,7 +261,6 @@ attributes #0 = { "amdgpu-no-agpr" } ; CHECK: attributes #[[ATTR4]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" } ; CHECK: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } ; CHECK: attributes #[[ATTR10]] = { "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll index 6d9da9281211c..f04c93961b670 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll @@ -1,6 +1,6 @@ ; Test the generation of the attribute amdgpu-no-flat-scratch-init -; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s ;; tests of alloca @@ -160,40 +160,40 @@ define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() { define void @call_without_alloca() { ; GFX9-LABEL: define void @call_without_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @call_without_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] call void @without_alloca(i1 true) ret void } define amdgpu_kernel void @call_without_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void 
@call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] call void @without_alloca(i1 true) ret void } define void @call_with_alloca() { ; GFX9-LABEL: define void @call_with_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_with_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] call void @with_alloca() ret void } define amdgpu_kernel void @call_with_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] call void @with_alloca() ret void } @@ -222,50 +222,50 @@ define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() { define void @call_call_without_alloca() { ; GFX9-LABEL: define void @call_call_without_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @call_call_without_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] call void @call_without_alloca() ret void } define amdgpu_kernel void @call_call_without_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] call void @call_without_alloca() ret void } define void @call_call_with_alloca() { ; GFX9-LABEL: 
define void @call_call_with_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_call_with_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] call void @call_with_alloca() ret void } define amdgpu_kernel void @call_call_with_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] call void @call_with_alloca() ret void } define void @with_alloca_call_without_alloca() { ; GFX9-LABEL: define void @with_alloca_call_without_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_alloca_call_without_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @without_alloca() @@ -274,10 +274,10 @@ define void @with_alloca_call_without_alloca() { define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @without_alloca() @@ -286,10 +286,10 @@ define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() { define void @with_alloca_call_with_alloca() { ; GFX9-LABEL: define void @with_alloca_call_with_alloca() 
-; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_alloca_call_with_alloca() -; GFX10-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @with_alloca() @@ -298,10 +298,10 @@ define void @with_alloca_call_with_alloca() { define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @with_alloca() @@ -310,10 +310,10 @@ define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() { define void @with_alloca_call_call_without_alloca() { ; GFX9-LABEL: define void @with_alloca_call_call_without_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_alloca_call_call_without_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @call_without_alloca() @@ -322,10 +322,10 @@ define void @with_alloca_call_call_without_alloca() { define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]] +; GFX10-SAME: 
#[[ATTR_GFX10_NO_NOFSI2]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @call_without_alloca() @@ -334,10 +334,10 @@ define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() { define void @with_alloca_call_call_with_alloca() { ; GFX9-LABEL: define void @with_alloca_call_call_with_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_alloca_call_call_with_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @call_with_alloca() @@ -346,10 +346,10 @@ define void @with_alloca_call_call_with_alloca() { define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @call_with_alloca() @@ -360,30 +360,30 @@ define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() { define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { ; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] store volatile i32 0, ptr addrspace(1) %ptr ret void } define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { ; 
GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] store volatile i32 0, ptr addrspace(1) %ptr ret void } define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { ; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] %stof = addrspacecast ptr addrspace(1) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -391,10 +391,10 @@ define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] %stof = addrspacecast ptr addrspace(1) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -402,30 +402,30 @@ define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrs define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { ; GFX9-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] ; ; GFX10-LABEL: define void 
@without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] store volatile i32 0, ptr addrspace(2) %ptr ret void } define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] store volatile i32 0, ptr addrspace(2) %ptr ret void } define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { ; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] %stof = addrspacecast ptr addrspace(2) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -433,10 +433,10 @@ define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] %stof = addrspacecast ptr addrspace(2) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -444,30 +444,30 @@ define amdgpu_kernel void 
@with_region_to_flat_addrspacecast_cc_kernel(ptr addrs define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] store volatile i32 0, ptr addrspace(3) %ptr ret void } define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] store volatile i32 0, ptr addrspace(3) %ptr ret void } define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] %stof = addrspacecast ptr addrspace(3) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -475,10 +475,10 @@ define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void 
@with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] %stof = addrspacecast ptr addrspace(3) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -486,30 +486,30 @@ define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrsp define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { ; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI5:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI5:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] store volatile i32 0, ptr addrspace(4) %ptr ret void } define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI6:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI6:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] store volatile i32 0, ptr addrspace(4) %ptr ret void } define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { ; GFX9-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] %stof = addrspacecast ptr addrspace(4) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -517,10 +517,10 @@ define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { define amdgpu_kernel 
void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] %stof = addrspacecast ptr addrspace(4) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -528,30 +528,30 @@ define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr add define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] store volatile i32 0, ptr addrspace(5) %ptr ret void } define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] store volatile i32 0, ptr addrspace(5) %ptr ret void } define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr 
addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -559,10 +559,10 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -570,50 +570,50 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] 
call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -621,10 +621,10 @@ define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrsp define amdgpu_kernel void 
@call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -632,78 +632,70 @@ define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacec define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } - - - - - - - - define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void 
@call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; 
GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -712,10 +704,10 @@ define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace( define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -724,10 +716,10 @@ define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_ define void 
@with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -736,10 +728,10 @@ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -750,10 +742,10 @@ define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_ define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] call void @without_alloca(i1 true) call void 
@without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -761,10 +753,10 @@ define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] call void @without_alloca(i1 true) call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -772,10 +764,10 @@ define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kern define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] call void @without_alloca(i1 true) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -783,16 +775,16 @@ define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) { define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; 
GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] call void @without_alloca(i1 true) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } -;; tests of indirect call, intrinsics +;; tests of indirect call, intrinsics, inline asm @gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4 @@ -838,14 +830,33 @@ define amdgpu_kernel void @call_with_indirect_call_cc_kernel() { ret void } +define void @empty() { + ret void +} + +define void @also_empty() { + ret void +} + +define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) { +; GFX9-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) +; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] +; +; GFX10-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) +; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] + %fptr = select i1 %cond, ptr @empty, ptr @also_empty + call void %fptr() + ret void +} + declare i32 @llvm.amdgcn.workgroup.id.x() define void @use_intrinsic_workitem_id_x() { ; GFX9-LABEL: define void @use_intrinsic_workitem_id_x() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI7:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI7:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] %val = call i32 @llvm.amdgcn.workitem.id.x() store volatile i32 %val, ptr addrspace(1) undef ret void @@ -853,10 +864,10 @@ define void @use_intrinsic_workitem_id_x() { define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] %val = call i32 @llvm.amdgcn.workitem.id.x() store volatile i32 %val, ptr addrspace(1) undef 
ret void @@ -864,84 +875,96 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() { define void @call_use_intrinsic_workitem_id_x() { ; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI7:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI4]] ; ; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI7:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI4]] call void @use_intrinsic_workitem_id_x() ret void } define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR_GFX9_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR_GFX10_NOFSI5:[0-9]+]] call void @use_intrinsic_workitem_id_x() ret void } +define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; +; GFX10-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] + %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) + store volatile i32 7, ptr %1, align 4 + ret void +} -; GFX9: attributes #[[ATTR_GFX9_NOFSI]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } - -; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } - -; GFX9: attributes #[[ATTR_GFX9_NOFSI2]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } - -; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } - -; GFX9: attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { nofree norecurse nounwind 
memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } - -; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI3]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } - -; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI4]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) +; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; +; GFX10-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) +; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] + call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) + ret void +} -; GFX9: attributes #[[ATTR_GFX9_NOFSI3]] = { nofree 
norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +define amdgpu_kernel void @with_inline_asm() { +; GFX9-LABEL: with_inline_asm +; GFX9-SAME: #[[ATTR_GFX9_NOFSI3]] +; +; GFX10-LABEL: with_inline_asm +; GFX10-SAME: #[[ATTR_GFX10_NOFSI3]] + call void asm sideeffect "; use $0", "a"(i32 poison) + ret void +} -; GFX9: attributes #[[ATTR_GFX9_NOFSI4]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI5]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI6]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes 
#[[ATTR_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NOFSI5]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NOFSI6]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } ; GFX9: attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } ; GFX9: attributes #[[ATTR_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NOFSI7]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } - -; GFX10: attributes #[[ATTR_GFX10_NOFSI]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI2]] = { nofree norecurse nounwind 
"amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" 
"amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI3]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI4]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI3]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI4]] = { nofree 
norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI5]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" 
"amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI6]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI5]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI6]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } ; GFX10: attributes #[[ATTR_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } ; GFX10: attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" 
"uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI7]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } + +; GFX10: attributes #[[ATTR_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index e911df4aa4da3..eb4cf5c063d10 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -407,7 +407,7 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" } ; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll index 667db7b1ebbd0..58a6437bcbf69 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll @@ -148,7 +148,7 @@ define amdgpu_kernel void @kernel_lds() { define internal i16 @mutual_recursion_0(i16 %arg) { ; CHECK-LABEL: define internal i16 @mutual_recursion_0( -; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR5:[0-9]+]] { +; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = call 
i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[RECURSIVE_KERNEL_LDS:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[RECURSIVE_KERNEL_LDS]], align 4 @@ -168,7 +168,7 @@ define internal i16 @mutual_recursion_0(i16 %arg) { define internal void @mutual_recursion_1(i16 %arg) { ; CHECK-LABEL: define internal void @mutual_recursion_1( -; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR5]] { +; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: call void @mutual_recursion_0(i16 [[ARG]]) ; CHECK-NEXT: ret void ; @@ -196,8 +196,6 @@ define amdgpu_kernel void @kernel_lds_recursion() { ; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. From 0a0025b81e6177ee0a90ca20f2dccd5ce235dfb1 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 21 Aug 2024 11:27:13 -0700 Subject: [PATCH 04/13] Undo changes in AMDGPUSubtarget.cpp to reduce impact on test files. Those code changes will be in a follow-up PR. 
--- .../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 368 +---- .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 390 +---- .../AMDGPU/GlobalISel/extractelement.ll | 69 +- ...licit-kernarg-backend-usage-global-isel.ll | 36 +- .../GlobalISel/insertelement-stack-lower.ll | 2 +- .../AMDGPU/GlobalISel/lds-global-value.ll | 5 +- .../GlobalISel/llvm.amdgcn.if.break.i64.ll | 3 - .../GlobalISel/llvm.amdgcn.trig.preop.ll | 24 - .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 33 - .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 33 - .../abi-attribute-hints-undefined-behavior.ll | 18 +- llvm/test/CodeGen/AMDGPU/addrspacecast.ll | 10 +- llvm/test/CodeGen/AMDGPU/always-uniform.ll | 3 - ...amdgpu-codegenprepare-fold-binop-select.ll | 3 - .../CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll | 4 +- .../attr-amdgpu-flat-work-group-size.ll | 4 +- .../CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll | 6 +- .../AMDGPU/attr-amdgpu-waves-per-eu.ll | 4 +- .../attributor-flatscratchinit-globalisel.ll | 54 +- llvm/test/CodeGen/AMDGPU/attributor-noopt.ll | 2 +- .../callee-special-input-sgprs-fixed-abi.ll | 36 +- llvm/test/CodeGen/AMDGPU/code-object-v3.ll | 12 +- .../CodeGen/AMDGPU/combine-reg-or-const.ll | 3 - ...dagcomb-extract-vec-elt-different-sizes.ll | 2 - ...cannot-create-empty-or-backward-segment.ll | 2 +- .../expand-scalar-carry-out-select-user.ll | 3 - llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 66 - .../fast-unaligned-load-store.global.ll | 20 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 236 +-- .../flat-for-global-subtarget-feature.ll | 7 +- llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll | 63 +- .../AMDGPU/fmul-2-combine-multi-use.ll | 48 - llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 60 - .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 3 - llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 62 +- llvm/test/CodeGen/AMDGPU/half.ll | 231 --- .../AMDGPU/hsa-metadata-kernel-code-props.ll | 9 +- llvm/test/CodeGen/AMDGPU/hsa.ll | 4 +- .../AMDGPU/implicit-kernarg-backend-usage.ll | 36 +- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll 
| 24 +- .../AMDGPU/insert_vector_elt.v2bf16.ll | 58 +- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 214 +-- llvm/test/CodeGen/AMDGPU/kernarg-size.ll | 2 +- .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 8 +- .../AMDGPU/llvm.amdgcn.readfirstlane.ll | 70 +- .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 114 +- .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 126 +- llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 6 - llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 127 +- llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 85 +- llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 18 - llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 164 +- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 129 +- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 83 +- llvm/test/CodeGen/AMDGPU/load-select-ptr.ll | 3 +- .../CodeGen/AMDGPU/mad24-get-global-id.ll | 2 +- .../match-perm-extract-vector-elt-bug.ll | 8 +- llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 20 +- .../AMDGPU/memory-legalizer-flat-agent.ll | 1380 ----------------- .../memory-legalizer-flat-nontemporal.ll | 75 - .../memory-legalizer-flat-singlethread.ll | 1380 ----------------- .../AMDGPU/memory-legalizer-flat-system.ll | 1380 ----------------- .../AMDGPU/memory-legalizer-flat-volatile.ll | 66 - .../AMDGPU/memory-legalizer-flat-wavefront.ll | 1365 ---------------- .../AMDGPU/memory-legalizer-flat-workgroup.ll | 1320 ---------------- .../AMDGPU/memory-legalizer-global-agent.ll | 273 ---- .../memory-legalizer-global-nontemporal.ll | 15 - .../memory-legalizer-global-singlethread.ll | 276 ---- .../AMDGPU/memory-legalizer-global-system.ll | 261 ---- .../memory-legalizer-global-volatile.ll | 18 - .../memory-legalizer-global-wavefront.ll | 276 ---- .../memory-legalizer-global-workgroup.ll | 276 ---- .../memory-legalizer-local-nontemporal.ll | 9 - .../AMDGPU/memory-legalizer-local-volatile.ll | 6 - .../memory-legalizer-private-nontemporal.ll | 59 +- .../memory-legalizer-private-volatile.ll | 30 +- llvm/test/CodeGen/AMDGPU/min.ll | 210 --- 
llvm/test/CodeGen/AMDGPU/pack.v2f16.ll | 21 - llvm/test/CodeGen/AMDGPU/pack.v2i16.ll | 18 - ...al-regcopy-and-spill-missed-at-regalloc.ll | 56 +- llvm/test/CodeGen/AMDGPU/sad.ll | 68 +- .../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll | 16 - .../scc-clobbered-sgpr-to-vmem-spill.ll | 456 +++--- .../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll | 2 +- llvm/test/CodeGen/AMDGPU/shift-i128.ll | 24 +- llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 70 +- .../CodeGen/AMDGPU/spill-vector-superclass.ll | 6 +- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 6 - llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll | 2 +- llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll | 2 +- llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll | 9 +- llvm/test/CodeGen/AMDGPU/trap-abis.ll | 16 +- llvm/test/CodeGen/AMDGPU/udiv.ll | 45 - llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 91 +- .../AMDGPU/vgpr-spill-placement-issue61083.ll | 2 +- ...ine-function-info-long-branch-reg-debug.ll | 7 +- .../machine-function-info-long-branch-reg.ll | 7 +- 97 files changed, 644 insertions(+), 12190 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index cb64c25b5f080..705bcbddf227a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -20,14 +20,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -38,14 +35,11 @@ define amdgpu_kernel void 
@lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -105,14 +99,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -123,14 +114,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -303,9 +291,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_dec_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 
flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -321,9 +306,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_dec_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -383,9 +365,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -403,9 +382,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -468,9 +444,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; CI-LABEL: global_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -488,9 +461,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; VI-LABEL: 
global_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -553,9 +523,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_dec_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -568,9 +535,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_dec_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -621,9 +585,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -638,9 +599,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -694,9 +652,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; CI-LABEL: global_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -711,9 +666,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; VI-LABEL: global_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -768,9 +720,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -778,7 +728,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -793,9 +742,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: 
s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -803,7 +750,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -868,9 +814,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -888,9 +831,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -950,9 +890,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -968,9 +905,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_ret_i32: ; VI: ; %bb.0: ; 
VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -986,8 +920,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1002,10 +934,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1042,9 +970,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -1062,9 +987,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ 
-1082,8 +1004,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1098,10 +1018,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1141,9 +1057,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -1161,9 +1074,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -1181,8 +1091,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; 
GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1197,10 +1105,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1240,9 +1144,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1255,9 +1156,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1270,8 +1168,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1283,10 +1179,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 
{ ; ; GFX10-LABEL: flat_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1319,9 +1211,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1336,9 +1225,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1353,8 +1239,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1366,10 +1250,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), 
s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1405,9 +1285,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1422,9 +1299,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1439,8 +1313,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1452,10 +1324,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1492,9 +1360,7 @@ define amdgpu_kernel void 
@flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1502,7 +1368,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -1517,9 +1382,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1527,7 +1390,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1542,8 +1404,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1562,10 +1422,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 @@ -1622,9 +1478,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1642,9 +1495,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1662,8 +1512,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1677,10 +1525,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1727,13 +1571,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1751,13 +1592,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1775,9 +1613,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_ret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -1792,10 
+1628,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1834,15 +1666,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1860,15 +1689,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1886,9 +1712,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX9: ; 
%bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -1903,10 +1727,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1948,13 +1768,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1964,13 +1781,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1980,9 +1794,7 @@ define 
amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -1994,10 +1806,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -2032,15 +1840,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2050,15 +1855,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 
32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2068,9 +1870,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -2082,10 +1882,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -2123,15 +1919,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2141,15 +1934,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: 
; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2159,9 +1949,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -2173,10 +1961,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -2215,9 +1999,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2244,9 +2025,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; VI: ; 
%bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2273,14 +2051,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2294,10 +2070,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2356,9 +2128,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2377,9 +2146,6 @@ define 
amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2398,14 +2164,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2414,10 +2178,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2471,11 +2231,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2492,11 +2249,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2572,10 +2326,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -2591,10 +2342,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2662,10 +2410,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -2681,10 +2426,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2870,13 +2612,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_dec_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2889,13 +2628,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_dec_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2955,15 +2691,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; 
CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2976,15 +2709,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3045,15 +2775,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; CI-LABEL: global_atomic_dec_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3066,15 +2793,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; VI-LABEL: global_atomic_dec_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3135,13 +2859,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_dec_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3151,13 +2872,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_dec_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3208,15 +2926,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: 
v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3226,15 +2941,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3286,15 +2998,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; CI-LABEL: global_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3304,15 +3013,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; VI-LABEL: global_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3365,9 +3071,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3391,9 +3094,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3470,9 +3170,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -3491,9 +3188,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 
8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -3564,10 +3258,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v4, s3 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: flat_store_dword v[3:4], v0 @@ -3586,10 +3277,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: flat_store_dword v[3:4], v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index 00ff2d7a35d56..b3a7e65f771c4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -20,14 +20,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -38,14 +35,11 @@ 
define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -105,14 +99,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -123,14 +114,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -303,9 +291,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_inc_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, 
s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -321,9 +306,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_inc_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -383,9 +365,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -403,9 +382,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -468,9 +444,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -488,9 +461,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr 
addrspace ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -553,9 +523,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_inc_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -568,9 +535,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_inc_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -621,9 +585,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -638,9 +599,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -694,9 +652,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; CI-LABEL: global_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -711,9 +666,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; VI-LABEL: global_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -768,9 +720,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -778,7 +728,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -793,9 +742,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, 
s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -803,7 +750,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -868,9 +814,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -888,9 +831,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -956,11 +896,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -977,11 +914,8 @@ define 
amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1057,10 +991,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1076,10 +1007,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1147,10 +1075,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1166,10 +1091,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, 
; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1355,13 +1277,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_inc_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1374,13 +1293,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_inc_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1440,15 +1356,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1461,15 +1374,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1530,15 +1440,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; CI-LABEL: global_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1551,15 +1458,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; VI-LABEL: global_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: 
s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1620,13 +1524,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_inc_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1636,13 +1537,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_inc_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1693,15 +1591,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 
s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1711,15 +1606,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1771,15 +1663,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; CI-LABEL: global_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1789,15 +1678,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; VI-LABEL: global_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1850,9 +1736,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1876,9 +1759,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1955,9 +1835,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1976,9 +1853,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2042,9 +1916,6 @@ define amdgpu_kernel void 
@flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -2060,9 +1931,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -2078,8 +1946,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2094,10 +1960,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2134,9 +1996,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; 
CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2154,9 +2013,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2174,8 +2030,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2190,10 +2044,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2233,9 +2083,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2253,9 +2100,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; VI-LABEL: 
flat_atomic_inc_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2273,8 +2117,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2289,10 +2131,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2332,9 +2170,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2347,9 +2182,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2362,8 +2194,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2375,10 +2205,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2411,9 +2237,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2428,9 +2251,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2445,8 +2265,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; 
GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2458,10 +2276,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2497,9 +2311,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2514,9 +2325,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2531,8 +2339,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2544,10 +2350,6 
@@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2584,9 +2386,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2594,7 +2394,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -2609,9 +2408,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2619,7 +2416,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 
0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -2634,8 +2430,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2654,10 +2448,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 @@ -2714,9 +2504,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2734,9 +2521,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2754,8 +2538,6 @@ define amdgpu_kernel void 
@flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2769,10 +2551,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2826,10 +2604,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v4, s3 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: flat_store_dword v[3:4], v0 @@ -2848,10 +2623,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: flat_store_dword v[3:4], v0 @@ -2924,13 +2696,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64: ; CI: ; %bb.0: ; 
CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2948,13 +2717,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2972,9 +2738,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_ret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -2989,10 +2753,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3031,15 
+2791,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3057,15 +2814,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3083,9 +2837,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -3100,10 +2852,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3145,15 +2893,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3171,15 +2916,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3197,9 +2939,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -3214,10 +2954,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3259,13 +2995,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3275,13 +3008,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3291,9 +3021,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i64: ; GFX9: ; 
%bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -3305,10 +3033,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3343,15 +3067,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3361,15 +3082,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: 
v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3379,9 +3097,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -3393,10 +3109,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3434,15 +3146,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3452,15 +3161,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3470,9 +3176,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -3484,10 +3188,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3526,9 +3226,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3555,9 +3252,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; VI-NEXT: 
s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3584,14 +3278,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3605,10 +3297,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3667,9 +3355,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -3688,9 +3373,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -3709,14 +3391,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3725,10 +3405,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3779,7 +3455,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_add_i32 s10, s10, s15 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s4 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 @@ -3787,8 +3462,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0 ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -3802,7 +3475,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 @@ -3810,8 +3482,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 868b530e42a21..34efb089b72bf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3037,7 +3037,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 12 +; GPRIDX-NEXT: user_sgpr_count = 10 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3052,7 +3052,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 -; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1 
+; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -3069,7 +3069,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 15 +; GPRIDX-NEXT: wavefront_sgpr_count = 13 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -3117,7 +3117,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -3128,7 +3128,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 12 +; MOVREL-NEXT: user_sgpr_count = 10 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3143,7 +3143,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 -; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1 +; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; MOVREL-NEXT: 
enable_sgpr_grid_workgroup_count_y = 0 @@ -3160,7 +3160,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 22 +; MOVREL-NEXT: wavefront_sgpr_count = 9 ; MOVREL-NEXT: workitem_vgpr_count = 4 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -3178,24 +3178,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; MOVREL-NEXT: s_load_dword s8, s[6:7], 0x8 -; MOVREL-NEXT: s_add_i32 s10, s10, s15 -; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; MOVREL-NEXT: s_mov_b32 s4, 0 ; MOVREL-NEXT: s_mov_b32 s5, 0x40080000 +; MOVREL-NEXT: s_mov_b32 s2, 0 +; MOVREL-NEXT: s_mov_b32 s3, 0x40140000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 ; MOVREL-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 ; MOVREL-NEXT: s_cmp_eq_u32 s8, 2 -; MOVREL-NEXT: s_mov_b32 s2, 0 ; MOVREL-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; MOVREL-NEXT: s_cmp_eq_u32 s8, 3 -; MOVREL-NEXT: s_mov_b32 s3, 0x40140000 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] ; MOVREL-NEXT: s_cmp_eq_u32 s8, 4 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 -; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; MOVREL-NEXT: v_mov_b32_e32 v1, s3 ; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3223,7 +3220,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 12 +; GFX10-NEXT: user_sgpr_count = 10 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: 
enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3238,7 +3235,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 -; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4057,7 +4054,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 12 +; GPRIDX-NEXT: user_sgpr_count = 10 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4072,7 +4069,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 -; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1 +; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4089,7 +4086,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 14 +; GPRIDX-NEXT: wavefront_sgpr_count = 12 ; GPRIDX-NEXT: workitem_vgpr_count = 2 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4130,7 +4127,7 @@ define amdgpu_kernel void 
@dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 0 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -4141,7 +4138,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 12 +; MOVREL-NEXT: user_sgpr_count = 10 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4156,7 +4153,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 -; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1 +; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4173,7 +4170,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 22 +; MOVREL-NEXT: wavefront_sgpr_count = 8 ; MOVREL-NEXT: workitem_vgpr_count = 3 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4191,9 +4188,6 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dword s2, s[6:7], 0x8 ; MOVREL-NEXT: s_load_dwordx2 s[0:1], 
s[6:7], 0x0 -; MOVREL-NEXT: s_add_i32 s10, s10, s15 -; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s2, 1 ; MOVREL-NEXT: s_cselect_b32 s3, 2.0, 1.0 @@ -4229,7 +4223,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 12 +; GFX10-NEXT: user_sgpr_count = 10 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4244,7 +4238,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 -; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4407,7 +4401,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 12 +; GPRIDX-NEXT: user_sgpr_count = 10 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4422,7 +4416,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 -; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1 +; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 ; GPRIDX-NEXT: 
enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4439,7 +4433,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 15 +; GPRIDX-NEXT: wavefront_sgpr_count = 13 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4483,7 +4477,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -4494,7 +4488,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 12 +; MOVREL-NEXT: user_sgpr_count = 10 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4509,7 +4503,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 -; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1 +; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4526,7 +4520,7 @@ define 
amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 22 +; MOVREL-NEXT: wavefront_sgpr_count = 9 ; MOVREL-NEXT: workitem_vgpr_count = 4 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4544,12 +4538,10 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dword s8, s[6:7], 0x8 ; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; MOVREL-NEXT: s_add_i32 s10, s10, s15 -; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; MOVREL-NEXT: s_mov_b32 s2, 0 +; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 -; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 ; MOVREL-NEXT: s_cmp_eq_u32 s8, 2 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] @@ -4557,7 +4549,6 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 -; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; MOVREL-NEXT: v_mov_b32_e32 v1, s3 ; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -4585,7 +4576,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 12 +; GFX10-NEXT: user_sgpr_count = 10 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4600,7 +4591,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: 
enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 -; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 70b889389ff99..9443b39dcdc03 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -12,9 +12,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 -; GFX8V4-NEXT: s_add_i32 s12, s12, s17 -; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_mov_b32 s4, s0 ; GFX8V4-NEXT: s_mov_b32 s5, s3 @@ -25,7 +23,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1 ; GFX8V4-NEXT: v_mov_b32_e32 v0, s4 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[6:7], 0 -; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V4-NEXT: flat_store_dword v[0:1], v2 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) @@ -40,9 +37,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5: ; %bb.0: ; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0xc8 -; GFX8V5-NEXT: s_add_i32 s10, s10, s15 -; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 +; 
GFX8V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_mov_b32 s4, s0 ; GFX8V5-NEXT: s_mov_b32 s5, s2 @@ -52,7 +47,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1 ; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[2:3], 0 -; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V5-NEXT: flat_store_dword v[0:1], v2 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) @@ -66,10 +60,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_mov_b32 s2, s0 ; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 @@ -78,7 +71,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 -; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) @@ -92,10 +84,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_mov_b32 s2, s0 ; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 @@ -104,7 +95,6 @@ define amdgpu_kernel void 
@addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 -; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) @@ -127,9 +117,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 -; GFX8V4-NEXT: s_add_i32 s12, s12, s17 -; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 @@ -143,9 +130,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xcc -; GFX8V5-NEXT: s_add_i32 s10, s10, s15 -; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 @@ -189,9 +173,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 -; GFX8V4-NEXT: s_add_i32 s12, s12, s17 -; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 @@ -205,9 +186,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 -; GFX8V5-NEXT: s_add_i32 s10, s10, s15 -; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, 
s10, 8 -; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 @@ -291,10 +269,7 @@ define amdgpu_kernel void @llvm_debugtrap() { define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_add_i32 s12, s12, s17 ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 -; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s7 ; GFX8V4-NEXT: s_add_u32 s0, s8, 8 ; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc @@ -320,9 +295,6 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_add_i32 s10, s10, s15 -; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8V5-NEXT: s_add_u32 s0, s6, 8 ; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V5-NEXT: s_addc_u32 s1, s7, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index 7aa3b5bb10990..4fcde0f2fc7cf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[20:23], s[6:7], 0x0 ; GCN-NEXT: s_load_dwordx2 s[24:25], s[6:7], 0x10 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 s0, s0, s13 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v64, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll index 6ed2df430998f..c3938e673a6da 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -11,16 +11,13 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace( ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 4 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: s_add_i32 s10, s10, s15 ; CHECK-NEXT: ds_read_b32 v2, v0 -; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; CHECK-NEXT: v_mov_b32_e32 v3, 9 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_add_u32 s0, s0, 4 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: v_mov_b32_e32 v3, 9 ; CHECK-NEXT: flat_store_dword v[0:1], v2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x200 ; CHECK-NEXT: ds_write_b32 v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll index 3a90c3ee90803..d7a82b415ff06 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll @@ -6,9 +6,6 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s2, s[6:7], 0x0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xa -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cselect_b32 s2, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll index 96fd14f52d13b..f3654fea486e0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll @@ -42,9 +42,6 @@ define amdgpu_kernel void 
@s_trig_preop_f64(double %a, i32 %b) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -62,9 +59,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -82,8 +76,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -93,10 +85,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; ; GFX10-LABEL: s_trig_preop_f64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 @@ -125,9 +113,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; CI-LABEL: s_trig_preop_f64_imm: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; CI-NEXT: s_add_u32 s0, s0, 4 @@ -143,9 +128,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; VI-LABEL: s_trig_preop_f64_imm: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; VI-NEXT: s_add_u32 s0, s0, 4 @@ -161,8 +143,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; GFX9-LABEL: s_trig_preop_f64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -171,10 +151,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX10-LABEL: s_trig_preop_f64_imm: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s10, s10, s15 -; GFX10-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index f0ec0d101f5be..7d7f450e590fa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -7,9 +7,6 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) 
; GFX8-NEXT: s_ashr_i32 s8, s5, 31 ; GFX8-NEXT: s_add_i32 s0, s5, s8 @@ -148,9 +145,6 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: sdivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 @@ -622,9 +616,6 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: sdivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s10, 31 @@ -854,9 +845,6 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1283,9 +1271,6 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: sdivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2203,9 +2188,6 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-LABEL: sdiv_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 @@ -2351,9 +2333,6 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x10 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 @@ -2618,9 +2597,6 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-LABEL: sdiv_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 @@ -2766,9 +2742,6 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-LABEL: sdivrem_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x10 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i16 s0, s3 ; GFX8-NEXT: s_ashr_i32 s8, s0, 31 @@ -3030,9 +3003,6 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-LABEL: 
sdivrem_i3: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 @@ -3184,9 +3154,6 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_i27: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 2be04ace99e36..5aef667934709 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -7,9 +7,6 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: udivrem_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX8-NEXT: s_sub_i32 s0, 0, s5 @@ -115,9 +112,6 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: udivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11 @@ -528,9 +522,6 @@ define amdgpu_kernel void @udivrem_i64(ptr 
addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: udivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 @@ -694,9 +685,6 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -991,10 +979,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: udivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_add_i32 s10, s10, s15 ; GFX8-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 @@ -1787,9 +1772,6 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-LABEL: udiv_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 @@ 
-1903,9 +1885,6 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[6:7], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -2102,9 +2081,6 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-LABEL: udiv_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s5, s4, 16 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 @@ -2217,10 +2193,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-LABEL: udivrem_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 -; GFX8-NEXT: s_add_i32 s10, s10, s15 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s2, s1, 0xffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2414,9 +2387,6 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-LABEL: udivrem_i3: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 @@ -2535,9 +2505,6 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: udivrem_i27: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 
-; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index b8ffa4f14c3e5..e53653408feb4 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -135,9 +135,6 @@ define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr) ; FIXEDABI-LABEL: marked_kernel_use_workitem_id: ; FIXEDABI: ; %bb.0: ; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 -; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 -; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0) ; FIXEDABI-NEXT: v_mov_b32_e32 v4, s1 ; FIXEDABI-NEXT: v_mov_b32_e32 v3, s0 @@ -184,19 +181,16 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr) ; FIXEDABI-LABEL: marked_kernel_use_workgroup_id: ; FIXEDABI: ; %bb.0: ; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 -; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 -; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8 +; FIXEDABI-NEXT: v_mov_b32_e32 v2, s6 ; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0) ; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0 ; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1 ; FIXEDABI-NEXT: flat_store_dword v[0:1], v2 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) -; FIXEDABI-NEXT: v_mov_b32_e32 v2, s9 +; FIXEDABI-NEXT: v_mov_b32_e32 v2, s7 ; FIXEDABI-NEXT: flat_store_dword v[0:1], v2 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) -; FIXEDABI-NEXT: v_mov_b32_e32 v2, s10 +; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8 ; FIXEDABI-NEXT: flat_store_dword v[0:1], v2 ; FIXEDABI-NEXT: 
s_waitcnt vmcnt(0) ; FIXEDABI-NEXT: s_endpgm @@ -244,9 +238,6 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 { define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 { ; FIXEDABI-LABEL: marked_kernel_use_other_sgpr: ; FIXEDABI: ; %bb.0: -; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 -; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 -; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; FIXEDABI-NEXT: s_add_u32 s0, s4, 8 ; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc ; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0 @@ -270,10 +261,7 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) # define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 { ; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr: ; FIXEDABI: ; %bb.0: -; FIXEDABI-NEXT: s_add_i32 s4, s4, s9 ; FIXEDABI-NEXT: v_mov_b32_e32 v0, 0 -; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s5 -; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; FIXEDABI-NEXT: v_mov_b32_e32 v1, 0 ; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc ; FIXEDABI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 9534561e6e280..62f6890e92662 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -69,7 +69,7 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 { ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3] ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1] ; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9 -; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9 +; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7 ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen @@ -254,8 +254,8 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 { ; HSA-LABEL: 
{{^}}cast_0_private_to_flat_addrspacecast: ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3] ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1] -; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9 -; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7 +; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7 +; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5 ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]] @@ -279,8 +279,8 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 { ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3] ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1] -; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9 -; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7 +; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7 +; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5 ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll index e0c69706bad79..0a461f9ee6c96 100644 --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -8,10 +8,8 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt ; GCN-LABEL: readfirstlane_uniform: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s0, s0, s4 @@ -20,7 +18,6 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt ; GCN-NEXT: s_add_u32 s0, s2, 40 ; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; 
GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index 89c5303e0e81d..bf72cccd912ce 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -393,14 +393,11 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) { ; GCN-LABEL: select_add_lhs_const_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: v_mov_b32_e32 v0, 0x83 ; GCN-NEXT: v_mov_b32_e32 v1, 0x80 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: flat_store_short v[0:1], v0 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll index 54a800ecee9f1..a6d8c6f41eee5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs -; TRAP-HANDLER-ENABLE: NumSgprs: 83 -; TRAP-HANDLER-DISABLE: NumSgprs: 98 +; TRAP-HANDLER-ENABLE: NumSgprs: 77 +; TRAP-HANDLER-DISABLE: NumSgprs: 92 define amdgpu_kernel void @amdhsa_trap_num_sgprs( ptr addrspace(1) %out0, i32 %in0, ptr addrspace(1) %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll index 22cc5af30da66..fc13b86566f76 100644 --- 
a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -35,9 +35,9 @@ entry: attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} ; CHECK-LABEL: {{^}}min_1024_max_1024 -; CHECK: SGPRBlocks: 2 +; CHECK: SGPRBlocks: 0 ; CHECK: VGPRBlocks: 10 -; CHECK: NumSGPRsForWavesPerEU: 24{{$}} +; CHECK: NumSGPRsForWavesPerEU: 2{{$}} ; CHECK: NumVGPRsForWavesPerEU: 43 @var = addrspace(1) global float 0.0 define amdgpu_kernel void @min_1024_max_1024() #3 { diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll index 3ddf8be052e4a..ed045107d354d 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -4,8 +4,8 @@ ; ALL-LABEL: {{^}}max_10_sgprs: -; ALL: SGPRBlocks: 2 -; ALL: NumSGPRsForWavesPerEU: 22 +; ALL: SGPRBlocks: 1 +; ALL: NumSGPRsForWavesPerEU: 10 define amdgpu_kernel void @max_10_sgprs() #0 { %one = load volatile i32, ptr addrspace(4) undef %two = load volatile i32, ptr addrspace(4) undef @@ -125,7 +125,7 @@ declare i64 @llvm.amdgcn.dispatch.id() #1 declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1 declare ptr addrspace(4) @llvm.amdgcn.queue.ptr() #1 -attributes #0 = { nounwind "amdgpu-num-sgpr"="18" } +attributes #0 = { nounwind "amdgpu-num-sgpr"="14" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind "amdgpu-num-sgpr"="12" } attributes #3 = { nounwind "amdgpu-num-sgpr"="11" } diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 26dc3e13a72cf..14519f5a5e77c 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"} ; Exactly 10 waves per execution unit. 
; CHECK-LABEL: {{^}}exactly_10: -; CHECK: SGPRBlocks: 3 +; CHECK: SGPRBlocks: 2 ; CHECK: VGPRBlocks: 5 -; CHECK: NumSGPRsForWavesPerEU: 26 +; CHECK: NumSGPRsForWavesPerEU: 20 ; CHECK: NumVGPRsForWavesPerEU: 24 define amdgpu_kernel void @exactly_10() #9 { %val0 = load volatile float, ptr addrspace(1) @var diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll index f9f48aacfc2bf..b7503f26b1ab6 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll @@ -503,38 +503,31 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; ; GFX10: name: with_alloca_cc_vs ; GFX10: argumentInfo: -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } -; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr2' } +; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr0' } ; ; GFX10: name: with_alloca_cc_gs ; GFX10: argumentInfo: -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } ; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr5' } ; ; GFX10: name: with_alloca_cc_ps ; GFX10: argumentInfo: -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } -; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr2' } +; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr0' } ; ; GFX10: name: with_alloca_cc_cs ; GFX10: argumentInfo: -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } -; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr2' } +; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr0' } ; ; GFX10: name: with_alloca_cc_hs ; GFX10: argumentInfo: -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } ; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr5' } ; ; GFX10: name: with_alloca_cc_ls ; GFX10: argumentInfo: -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } -; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr2' } +; GFX10-NEXT: 
privateSegmentWaveByteOffset: { reg: '$sgpr0' } ; ; GFX10: name: with_alloca_cc_es ; GFX10: argumentInfo: -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr0_sgpr1' } -; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr2' } +; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr0' } ; ; GFX10: name: with_alloca_cc_gfx ; GFX10: argumentInfo: @@ -701,8 +694,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } ; ; GFX10: name: without_region_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -730,8 +723,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } ; ; GFX10: name: without_group_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -759,8 +751,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } ; ; GFX10: name: without_constant_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -788,8 +779,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: 
'$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } ; ; GFX10: name: without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -817,8 +807,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } ; ; GFX10: name: call_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -846,8 +835,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } ; ; GFX10: name: call_both_with_and_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -861,8 +849,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } ; ; GFX10: name: call_call_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -890,8 +877,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: 
'$sgpr4_sgpr5' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } ; ; GFX10: name: call_call_both_with_and_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -905,8 +891,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } ; ; GFX10: name: with_cast_call_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -920,8 +905,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } ; ; GFX10: name: with_cast_call_with_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -935,8 +919,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } ; ; GFX10: name: call_without_alloca_and_without_addrspacecast ; GFX10: argumentInfo: @@ -964,8 +947,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: 
flatScratchInit: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } ; ; GFX10: name: with_indirect_call ; GFX10: argumentInfo: diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll index 470c444ad8cd7..90562e25a3e9c 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll @@ -23,7 +23,7 @@ ; COV5: .amdhsa_user_sgpr_queue_ptr 0 ; NOOPT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; NOOPT: .amdhsa_user_sgpr_dispatch_id 1 -; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 1 +; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0 ; NOOPT: .amdhsa_user_sgpr_private_segment_size 0 ; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 ; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index 25a5207d3bbb7..8ef2d89e76d4e 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -198,11 +198,11 @@ define hidden void @use_workgroup_id_yz() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x: ; GCN-NOT: s6 -; GCN: s_mov_b32 s12, s4 +; GCN: s_mov_b32 s12, s6 ; GCN: s_mov_b32 s32, 0 -; GCN: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+12 +; GCN: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, use_workgroup_id_x@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, use_workgroup_id_x@rel32@hi+12 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm @@ -216,7 +216,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y: ; GCN-NOT: s12 -; GCN: s_mov_b32 s13, s5 +; GCN: s_mov_b32 s13, s7 ; GCN-NOT: s12 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ 
-232,7 +232,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z: ; GCN-NOT: s12 ; GCN-NOT: s13 -; GCN: s_mov_b32 s14, s5 +; GCN: s_mov_b32 s14, s7 ; GCN-NOT: s12 ; GCN-NOT: s13 @@ -249,8 +249,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy: ; GCN-NOT: s14 -; GCN: s_mov_b32 s12, s4 -; GCN-NEXT: s_mov_b32 s13, s5 +; GCN: s_mov_b32 s12, s6 +; GCN-NEXT: s_mov_b32 s13, s7 ; GCN-NOT: s14 ; GCN: s_mov_b32 s32, 0 @@ -265,9 +265,9 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 { } ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz: -; GCN: s_mov_b32 s12, s4 -; GCN: s_mov_b32 s13, s5 -; GCN: s_mov_b32 s14, s6 +; GCN: s_mov_b32 s12, s6 +; GCN: s_mov_b32 s13, s7 +; GCN: s_mov_b32 s14, s8 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -282,8 +282,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz: ; GCN-NOT: s13 -; GCN: s_mov_b32 s12, s4 -; GCN-NEXT: s_mov_b32 s14, s5 +; GCN: s_mov_b32 s12, s6 +; GCN-NEXT: s_mov_b32 s14, s7 ; GCN-NOT: s13 ; GCN: s_mov_b32 s32, 0 @@ -299,8 +299,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz: -; GCN: s_mov_b32 s13, s5 -; GCN: s_mov_b32 s14, s6 +; GCN: s_mov_b32 s13, s7 +; GCN: s_mov_b32 s14, s8 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -381,7 +381,7 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 { ; GCN-NOT: s13 ; GCN-NOT: s14 -; GCN-DAG: s_mov_b32 s12, s4 +; GCN-DAG: s_mov_b32 s12, s6 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b ; GCN-NOT: s13 ; GCN-NOT: s14 @@ -399,7 +399,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 { ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y: ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN-DAG: s_mov_b32 s13, s5 +; GCN-DAG: s_mov_b32 s13, s7 ; 
GCN-DAG: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -414,7 +414,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 { ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z: ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN-DAG: s_mov_b32 s14, s5 +; GCN-DAG: s_mov_b32 s14, s7 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll index 48ad2fe687804..3035a8579c8a6 100644 --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -12,13 +12,13 @@ ; OSABI-AMDHSA-ASM: .section .rodata,"a" ; OSABI-AMDHSA-ASM: .p2align 6 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fadd -; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 12 +; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 16 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 -; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 1 +; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel ; OSABI-AMDHSA-ASM: .text @@ -31,13 +31,13 @@ ; OSABI-AMDHSA-ASM: .section .rodata,"a" ; OSABI-AMDHSA-ASM: .p2align 6 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fsub -; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 12 +; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 16 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 -; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 1 +; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel ; OSABI-AMDHSA-ASM: .text diff --git 
a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll index c97d333800602..5fbcd0bf66999 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -5,9 +5,6 @@ define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 { ; CHECK-LABEL: _Z11test_kernelPii: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-NEXT: s_add_i32 s10, s10, s15 -; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-NEXT: s_load_dword s0, s[6:7], 0x2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 3 diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll index f586f6d16e0ef..297fe7618672e 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll @@ -6,8 +6,6 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) { ; CHECK-LABEL: eggs: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 ; CHECK-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 39328d706ba26..85ed2914b8c7f 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: 
s_load_dword s2, s[6:7], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-NEXT: s_load_dword s14, s[6:7], 0x4 -; CHECK-NEXT: s_add_u32 s24, s24, s15 +; CHECK-NEXT: s_add_u32 s24, s24, s13 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index 05a245cd3443c..54fb1dc5c0527 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -94,9 +94,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-LABEL: s_add_co_br_user: ; GFX7: ; %bb.0: ; %bb ; GFX7-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s0, s2, s2 ; GFX7-NEXT: s_cmp_lt_u32 s0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 7fde702bd49ac..21799ab79b839 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -13,9 +13,6 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -28,9 +25,6 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -74,9 +68,6 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -89,9 +80,6 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -134,9 +122,6 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -149,9 +134,6 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -193,9 +175,6 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-LABEL: s_fabs_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: 
s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -209,9 +188,6 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; VI-LABEL: s_fabs_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -256,9 +232,6 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; CI-LABEL: fabs_fold_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -276,9 +249,6 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -325,9 +295,6 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -342,9 +309,6 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) 
%out, ptr addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -394,9 +358,6 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -409,9 +370,6 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -457,9 +415,6 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 @@ -486,9 +441,6 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -547,12 +499,9 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_lshr_b32 s2, s4, 16 @@ -578,12 +527,9 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -643,9 +589,6 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -669,9 +612,6 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -742,9 +682,6 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -763,9 +700,6 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 4df6b8d066915..7252c69cb1cf7 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -74,9 +74,6 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 -; GFX7-ALIGNED-NEXT: s_add_i32 s10, s10, s15 -; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -93,9 +90,6 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x2 -; GFX7-UNALIGNED-NEXT: s_add_i32 s10, s10, s15 -; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -226,10 +220,8 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 -; GFX7-ALIGNED-NEXT: s_add_i32 s10, s10, s15 -; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -239,7 +231,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 @@ -256,9 +247,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 -; GFX7-UNALIGNED-NEXT: s_add_i32 s10, s10, s15 -; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -371,9 +359,6 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-LABEL: 
global_store_2xi16_align4: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 -; GFX7-ALIGNED-NEXT: s_add_i32 s10, s10, s15 -; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -384,9 +369,6 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 -; GFX7-UNALIGNED-NEXT: s_add_i32 s10, s10, s15 -; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index 6482749bd2fb7..f0ce96af90649 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -24,9 +24,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; GFX678-LABEL: v_test_canonicalize_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -83,9 +80,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s2, s[6:7], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX6-NEXT: s_add_i32 s10, s10, s15 -; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) 
; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -97,9 +91,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -149,9 +140,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -208,9 +196,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -268,9 +253,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -327,9 +309,6 @@ define amdgpu_kernel void 
@test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; GFX678-LABEL: test_fold_canonicalize_undef_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -373,9 +352,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -419,9 +395,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -468,9 +441,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -515,9 +485,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, -1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -562,9 +529,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; GFX678-LABEL: test_fold_canonicalize_literal_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -609,9 +573,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -655,13 +616,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -706,13 +664,10 @@ define amdgpu_kernel void 
@test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -757,13 +712,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -808,9 +760,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -855,9 +804,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: ; 
GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -904,9 +850,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -951,9 +894,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; GFX678-LABEL: test_fold_canonicalize_qnan_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -998,9 +938,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1045,9 +982,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 
s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1092,9 +1026,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1139,9 +1070,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1186,9 +1114,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1233,9 +1158,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1280,9 +1202,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; GFX678-LABEL: v_test_canonicalize_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1338,9 +1257,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; GFX6-LABEL: s_test_canonicalize_var_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX6-NEXT: s_add_i32 s10, s10, s15 -; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1351,9 +1267,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; GFX8-LABEL: s_test_canonicalize_var_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -1400,9 +1313,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: 
v_mov_b32_e32 v1, s1 @@ -1459,9 +1369,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1519,9 +1426,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1578,13 +1482,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1630,13 +1531,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 
v0, 0 +; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1680,13 +1578,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1728,13 +1623,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1776,13 +1668,10 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; GFX678-LABEL: test_fold_canonicalize_literal_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1824,13 +1713,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1876,13 +1762,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1927,13 +1810,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1977,13 +1857,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2028,13 +1905,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; GFX678-LABEL: test_fold_canonicalize_qnan_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2076,13 +1950,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2124,13 +1995,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2172,13 +2040,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2220,13 +2085,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2268,13 +2130,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2316,13 +2175,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX678-NEXT: s_add_i32 s10, s10, s15 -; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2365,9 +2221,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX6-NEXT: s_add_i32 s10, s10, s15 -; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2385,9 +2238,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2455,9 +2305,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX6-NEXT: s_add_i32 s10, s10, s15 -; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2475,9 +2322,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2545,9 +2389,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; GFX6-NEXT: s_add_i32 s10, s10, s15 -; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2566,9 +2407,6 @@ define 
amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2637,9 +2475,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX6-NEXT: s_add_i32 s10, s10, s15 -; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2663,9 +2498,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2736,9 +2568,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX6-NEXT: s_add_i32 s10, s10, s15 -; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2756,9 +2585,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX8-NEXT: 
s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2826,9 +2652,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX6-NEXT: s_add_i32 s10, s10, s15 -; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2846,9 +2669,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2917,9 +2737,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; GFX6-NEXT: s_add_i32 s10, s10, s15 -; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2938,9 +2755,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -3010,9 +2824,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX6-NEXT: s_add_i32 s10, s10, s15 -; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -3036,9 +2847,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -3108,9 +2916,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX6-NEXT: s_add_i32 s10, s10, s15 -; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -3128,9 +2933,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll index 
ab00b132f2fd1..fee6540f43c64 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll @@ -6,13 +6,18 @@ ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga | FileCheck -check-prefix=NOHSA-NOADDR64 -check-prefix=ALL %s +; There are no stack objects even though flat is used by default, so +; flat_scratch_init should be disabled. + ; ALL-LABEL: {{^}}test: +; ALL-NOT: flat_scr + ; HSA-DEFAULT: flat_store_dword ; HSA-NODEFAULT: buffer_store_dword ; HSA-NOADDR64: flat_store_dword -; HSA: .amdhsa_user_sgpr_flat_scratch_init 1 +; HSA: .amdhsa_user_sgpr_flat_scratch_init 0 ; NOHSA-DEFAULT: buffer_store_dword ; NOHSA-NODEFAULT: flat_store_dword diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll index 134b790238086..e4ffedd686ac9 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefix=CI -check-prefix=GCN %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefix=FIJI-NOXNACK -check-prefix=GCN %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s ; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s @@ -8,16 +8,16 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefixes=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | 
FileCheck -check-prefixes=HSA-VI-NOXNACK,GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX9-ARCH-FLAT-NOXNACK,GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX9-ARCH-FLAT-XNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX10-ARCH-FLAT-NOXNACK,GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX10-ARCH-FLAT-XNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck 
-check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s ; GCN-LABEL: {{^}}no_vcc_no_flat: @@ -28,12 +28,8 @@ ; CI: ; NumSgprs: 8 ; VI-NOXNACK: ; NumSgprs: 8 ; VI-XNACK: ; NumSgprs: 12 -; HSA-VI-NOXNACK: ; NumSgprs: 8 -; HSA-VI-XNACK: ; NumSgprs: 12 -; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14 -; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14 -; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 8 -; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 8 +; GFX9-ARCH-FLAT: ; NumSgprs: 14 +; GFX10-ARCH-FLAT: ; NumSgprs: 8 define amdgpu_kernel void @no_vcc_no_flat() { entry: call void asm sideeffect "", "~{s7}"() @@ -49,12 +45,8 @@ entry: ; CI: ; NumSgprs: 10 ; VI-NOXNACK: ; NumSgprs: 10 ; VI-XNACK: ; NumSgprs: 12 -; HSA-VI-NOXNACK: ; NumSgprs: 10 -; HSA-VI-XNACK: ; NumSgprs: 12 -; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14 -; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14 -; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 10 -; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 10 +; GFX9-ARCH-FLAT: ; NumSgprs: 14 +; GFX10-ARCH-FLAT: ; NumSgprs: 10 define amdgpu_kernel void @vcc_no_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc}"() @@ -68,15 +60,10 @@ entry: ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 ; CI: ; NumSgprs: 12 -; FIJI-NOXNACK: ; NumSgprs: 14 ; VI-NOXNACK: ; NumSgprs: 14 ; VI-XNACK: ; NumSgprs: 14 -; HSA-VI-NOXNACK: ; NumSgprs: 24 -; HSA-VI-XNACK: ; NumSgprs: 24 -; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14 -; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14 -; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 8 -; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 8 +; GFX9-ARCH-FLAT: ; NumSgprs: 14 +; GFX10-ARCH-FLAT: ; NumSgprs: 8 define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{flat_scratch}"() @@ -92,10 +79,8 @@ entry: ; CI: ; NumSgprs: 12 ; VI-NOXNACK: ; NumSgprs: 14 ; VI-XNACK: ; NumSgprs: 14 -; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14 -; 
GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14 -; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 10 -; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 10 +; GFX9-ARCH-FLAT: ; NumSgprs: 14 +; GFX10-ARCH-FLAT: ; NumSgprs: 10 define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"() @@ -114,10 +99,8 @@ entry: ; CI: NumSgprs: 4 ; VI-NOXNACK: NumSgprs: 6 ; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6 -; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6 -; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0 -; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0 +; GFX9-ARCH-FLAT: ; NumSgprs: 6 +; GFX10-ARCH-FLAT: ; NumSgprs: 0 define amdgpu_kernel void @use_flat_scr() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch}"() @@ -133,10 +116,8 @@ entry: ; CI: NumSgprs: 4 ; VI-NOXNACK: NumSgprs: 6 ; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6 -; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6 -; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0 -; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0 +; GFX9-ARCH-FLAT: ; NumSgprs: 6 +; GFX10-ARCH-FLAT: ; NumSgprs: 0 define amdgpu_kernel void @use_flat_scr_lo() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"() @@ -152,10 +133,8 @@ entry: ; CI: NumSgprs: 4 ; VI-NOXNACK: NumSgprs: 6 ; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6 -; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6 -; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0 -; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0 +; GFX9-ARCH-FLAT: ; NumSgprs: 6 +; GFX10-ARCH-FLAT: ; NumSgprs: 0 define amdgpu_kernel void @use_flat_scr_hi() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"() diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 23b453438f3a0..c60b9858abd83 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -16,9 +16,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, 
flo ; VI-LABEL: multiple_fadd_use_test_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f32_e64 v0, s3, -1.0 ; VI-NEXT: v_add_f32_e64 v1, s2, -1.0 @@ -85,11 +82,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; VI-NEXT: s_load_dword s3, s[6:7], 0x2c -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_add_u32 s2, s0, 4 ; VI-NEXT: v_add_f32_e64 v2, s4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -149,9 +143,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; VI-LABEL: multiple_use_fadd_fmad_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s4, s0, 4 @@ -209,9 +200,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 4 ; VI-NEXT: v_mov_b32_e32 v0, s1 @@ -275,9 +263,6 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -328,13 +313,10 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -380,9 +362,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; VI-DENORM: ; %bb.0: ; VI-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-DENORM-NEXT: s_add_i32 s10, s10, s15 -; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16 ; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -401,9 +380,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH: ; %bb.0: ; VI-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-FLUSH-NEXT: s_add_i32 s10, s10, s15 -; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 ; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -522,9 +498,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-DENORM: ; %bb.0: ; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-DENORM-NEXT: s_add_i32 s10, s10, s15 -; VI-DENORM-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 @@ -546,9 +519,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH: ; %bb.0: ; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-FLUSH-NEXT: s_add_i32 s10, s10, s15 -; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -649,9 +619,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; VI-DENORM: ; %bb.0: ; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-DENORM-NEXT: s_add_i32 s10, s10, s15 -; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 @@ -673,9 +640,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH: ; %bb.0: ; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-FLUSH-NEXT: s_add_i32 s10, s10, s15 -; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 @@ -778,8 +742,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-DENORM-NEXT: s_load_dword s6, s[6:7], 0x8 -; VI-DENORM-NEXT: s_add_i32 s10, s10, s15 -; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 @@ -787,7 +749,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s1 ; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 -; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-DENORM-NEXT: s_add_u32 s4, s2, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 ; VI-DENORM-NEXT: s_addc_u32 s5, s3, 0 @@ -804,8 +765,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-FLUSH-NEXT: s_load_dword s6, s[6:7], 0x8 -; VI-FLUSH-NEXT: s_add_i32 s10, s10, s15 -; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -813,7 +772,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s1 ; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 -; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-FLUSH-NEXT: s_add_u32 s4, s2, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 ; VI-FLUSH-NEXT: s_addc_u32 s5, s3, 0 @@ -917,9 +875,6 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 @@ -973,13 +928,10 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: 
v_mov_b32_e32 v0, 0xc600 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index c4f13749251a4..8267bb9f5450f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -8,9 +8,6 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; CI-LABEL: fneg_fabs_fadd_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -28,9 +25,6 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -77,9 +71,6 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; CI-LABEL: fneg_fabs_fmul_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s1, s0, 0x7fff ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -98,9 +89,6 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; VI: ; %bb.0: ; VI-NEXT: 
s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -151,9 +139,6 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -166,9 +151,6 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -213,9 +195,6 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -228,9 +207,6 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -273,9 +249,6 
@@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; CIVI-LABEL: v_fneg_fabs_f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -321,9 +294,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 @@ -346,9 +316,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -357,7 +325,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -402,9 +369,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_or_b32 s2, s2, 0x80008000 
; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -417,9 +381,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s2, 0x80008000 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -462,9 +423,6 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; CIVI-LABEL: fneg_fabs_v4f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -510,9 +468,6 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; CI-LABEL: fold_user_fneg_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| @@ -534,9 +489,7 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v0, 0xc400 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -544,7 +497,6 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; VI-NEXT: v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: 
v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -586,9 +538,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -606,9 +555,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -665,9 +611,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010 @@ -692,9 +635,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_add_i32 s10, s10, s15 ; VI-NEXT: v_mov_b32_e32 v5, 0xc400 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_lshr_b32 s1, s4, 16 @@ -703,7 +644,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; VI-NEXT: s_and_b32 s0, s4, 
0x7fff7fff ; VI-NEXT: v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mul_f16_e64 v5, |s4|, -4.0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 ; VI-NEXT: v_mov_b32_e32 v5, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 7137834b6552e..98b17bbaa0a95 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1478,8 +1478,6 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x4 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x6 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bitcmp1_b32 s8, 0 ; GFX7-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1491,7 +1489,6 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX7-NEXT: s_cselect_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index 6e22c9c319f69..40982347f3ca0 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -10,9 +10,6 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: 
v_mov_b32_e32 v0, s0 @@ -25,9 +22,6 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -72,9 +66,6 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -89,9 +80,6 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -141,9 +129,6 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -156,9 +141,6 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8-NEXT: s_add_i32 
s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -201,9 +183,6 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; CI-LABEL: v_fneg_fold_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -221,9 +200,6 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX8-LABEL: v_fneg_fold_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -270,9 +246,6 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -285,9 +258,6 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -329,17 +299,14 @@ define amdgpu_kernel void 
@s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; CIVI-LABEL: s_fneg_v2f16_nonload: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 ; CIVI-NEXT: ;;#ASMSTART ; CIVI-NEXT: ; def s2 ; CIVI-NEXT: ;;#ASMEND ; CIVI-NEXT: s_xor_b32 s2, s2, 0x80008000 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 -; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: flat_store_dword v[0:1], v2 ; CIVI-NEXT: s_endpgm ; @@ -382,9 +349,6 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -399,9 +363,6 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -451,9 +412,6 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -466,9 +424,6 @@ define amdgpu_kernel void 
@fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -511,9 +466,6 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; CI-LABEL: v_fneg_fold_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -540,9 +492,6 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; GFX8-LABEL: v_fneg_fold_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -590,9 +539,6 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fneg_fold_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -614,9 +560,6 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; GFX8-LABEL: v_extract_fneg_fold_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -681,9 +624,6 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 ; CIVI-LABEL: v_extract_fneg_no_fold_v2f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 6207c442f41ee..3735c6349fbb3 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -10,9 +10,6 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -24,9 +21,6 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -54,9 +48,6 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, 
s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -68,9 +59,6 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -97,9 +85,6 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; CIVI-LABEL: load_v3f16_arg: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_add_u32 s4, s0, 4 ; CIVI-NEXT: s_addc_u32 s5, s1, 0 @@ -135,9 +120,6 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; CIVI-LABEL: load_v4f16_arg: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v2, s2 @@ -165,9 +147,6 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -182,9 +161,6 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_add_i32 s10, s10, s15 -; 
VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -217,9 +193,6 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -233,9 +206,6 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -269,9 +239,6 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -283,9 +250,6 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -315,9 +279,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; 
CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -331,9 +292,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -366,9 +324,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; CI-LABEL: extload_v3f16_to_v3f32_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 @@ -382,9 +337,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; VI-LABEL: extload_v3f16_to_v3f32_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -417,9 +369,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; CI-LABEL: extload_v4f16_to_v4f32_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 
16 ; CI-NEXT: s_lshr_b32 s5, s2, 16 @@ -435,9 +384,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; VI-LABEL: extload_v4f16_to_v4f32_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -475,9 +421,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s1, 16 ; CI-NEXT: s_lshr_b32 s7, s0, 16 @@ -506,9 +449,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s1, 16 ; VI-NEXT: s_lshr_b32 s7, s0, 16 @@ -567,9 +507,6 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; CI-LABEL: extload_f16_to_f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -583,9 +520,6 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; VI-LABEL: extload_f16_to_f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s0, s[6:7], 0x8 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: 
s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -619,9 +553,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; CI-LABEL: extload_v2f16_to_v2f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 @@ -638,9 +569,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; VI-LABEL: extload_v2f16_to_v2f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s0, s[6:7], 0x8 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 @@ -680,9 +608,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; CI-LABEL: extload_v3f16_to_v3f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: s_lshr_b32 s4, s2, 16 @@ -704,9 +629,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; VI-LABEL: extload_v3f16_to_v3f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 @@ -754,9 +676,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; 
CI-LABEL: extload_v4f16_to_v4f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -782,9 +701,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; VI-LABEL: extload_v4f16_to_v4f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -840,9 +756,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 @@ -890,9 +803,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 @@ -980,9 +890,6 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr ; CIVI-LABEL: global_load_store_f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; 
CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1013,9 +920,6 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: global_load_store_v2f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1046,9 +950,6 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add ; CIVI-LABEL: global_load_store_v4f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -1079,9 +980,6 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: global_load_store_v8f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1112,9 +1010,6 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr ; CIVI-LABEL: global_extload_f16_to_f32: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1148,9 +1043,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v2f16_to_v2f32: ; CI: ; %bb.0: ; CI-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1167,9 +1059,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v2f16_to_v2f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1207,9 +1096,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v3f16_to_v3f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1227,9 +1113,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v3f16_to_v3f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1269,9 +1152,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v4f16_to_v4f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1291,9 +1171,6 @@ define 
amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v4f16_to_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1336,9 +1213,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v8f16_to_v8f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1369,9 +1243,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v8f16_to_v8f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1430,9 +1301,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-LABEL: global_extload_v16f16_to_v16f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -1491,9 +1359,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; VI-LABEL: global_extload_v16f16_to_v16f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 
flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1593,9 +1458,6 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr ; CIVI-LABEL: global_extload_f16_to_f64: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1632,9 +1494,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v2f16_to_v2f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1653,9 +1512,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v2f16_to_v2f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1698,9 +1554,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v3f16_to_v3f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1726,9 +1579,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v3f16_to_v3f64: ; VI: ; 
%bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1782,9 +1632,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v4f16_to_v4f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1813,9 +1660,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v4f16_to_v4f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1875,9 +1719,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v8f16_to_v8f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1926,9 +1767,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v8f16_to_v8f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2015,9 
+1853,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-LABEL: global_extload_v16f16_to_v16f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2113,9 +1948,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-LABEL: global_extload_v16f16_to_v16f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2271,9 +2103,6 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p ; CIVI-LABEL: global_truncstore_f32_to_f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2307,9 +2136,6 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v2f32_to_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2327,9 +2153,6 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v2f32_to_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 
flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2368,9 +2191,6 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v3f32_to_v3f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2394,9 +2214,6 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v3f32_to_v3f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2444,9 +2261,6 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v4f32_to_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2468,9 +2282,6 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v4f32_to_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2515,9 +2326,6 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; 
CI-LABEL: global_truncstore_v8f32_to_v8f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2553,9 +2361,6 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v8f32_to_v8f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2621,9 +2426,6 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-LABEL: global_truncstore_v16f32_to_v16f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 32 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -2693,9 +2495,6 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI-LABEL: global_truncstore_v16f32_to_v16f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -2808,9 +2607,6 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; CI-LABEL: fadd_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[6:7], 0x2 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 
; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -2828,9 +2624,6 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2863,9 +2656,6 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; CI-LABEL: fadd_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2887,9 +2677,6 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -2923,9 +2710,6 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-LABEL: fadd_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2963,9 +2747,6 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-LABEL: fadd_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3006,9 +2787,6 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 { ; CI-LABEL: fadd_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3067,9 +2845,6 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; ; VI-LABEL: fadd_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3134,9 +2909,6 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr ; CIVI-LABEL: test_bitcast_from_half: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -3168,9 +2940,6 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs ; CIVI-LABEL: test_bitcast_to_half: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll 
b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index 51b0e2b86cdf3..8c017fa5ec263 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -15,10 +15,7 @@ ; CHECK: .max_flat_workgroup_size: 1024 ; CHECK: .name: test ; CHECK: .private_segment_fixed_size: 0 -; GFX700: .sgpr_count: 22 -; GFX803: .sgpr_count: 24 -; GFX900: .sgpr_count: 10 -; GFX1010: .sgpr_count: 10 +; CHECK: .sgpr_count: 10 ; CHECK: .symbol: test.kd ; CHECK: .vgpr_count: {{3|6}} ; WAVE64: .wavefront_size: 64 @@ -51,8 +48,8 @@ entry: ; CHECK: .name: num_spilled_sgprs ; GFX700: .sgpr_spill_count: 10 -; GFX803: .sgpr_spill_count: 0 -; GFX900: .sgpr_spill_count: 0 +; GFX803: .sgpr_spill_count: 10 +; GFX900: .sgpr_spill_count: 62 ; GFX1010: .sgpr_spill_count: 60 ; CHECK: .symbol: num_spilled_sgprs.kd define amdgpu_kernel void @num_spilled_sgprs( diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll index 2c38e201d326f..37476203fbfad 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -43,7 +43,7 @@ ; ELF: 00E0: 6E616D65 A673696D 706C65BB 2E707269 ; ELF: 00F0: 76617465 5F736567 6D656E74 5F666978 ; ELF: 0100: 65645F73 697A6500 AB2E7367 70725F63 -; ELF: 0110: 6F756E74 0EB12E73 6770725F 7370696C +; ELF: 0110: 6F756E74 06B12E73 6770725F 7370696C ; ELF: 0120: 6C5F636F 756E7400 A72E7379 6D626F6C ; ELF: 0130: A973696D 706C652E 6B64AB2E 76677072 ; ELF: 0140: 5F636F75 6E7403B1 2E766770 725F7370 @@ -59,7 +59,7 @@ ; ELF: 01E0: 73696D70 6C655F6E 6F5F6B65 726E6172 ; ELF: 01F0: 6773BB2E 70726976 6174655F 7365676D ; ELF: 0200: 656E745F 66697865 645F7369 7A6500AB -; ELF: 0210: 2E736770 725F636F 756E740C B12E7367 +; ELF: 0210: 2E736770 725F636F 756E7400 B12E7367 ; ELF: 0220: 70725F73 70696C6C 5F636F75 6E7400A7 ; ELF: 0230: 2E73796D 626F6CB5 73696D70 6C655F6E ; ELF: 0240: 6F5F6B65 726E6172 67732E6B 64AB2E76 diff --git 
a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index 8f9b223c361d6..1b12f668e207c 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -12,9 +12,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 -; GFX8V4-NEXT: s_add_i32 s12, s12, s17 -; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 ; GFX8V4-NEXT: s_cselect_b32 s3, s3, 0 @@ -24,7 +22,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX8V4-NEXT: s_cselect_b32 s0, s2, 0 ; GFX8V4-NEXT: s_cselect_b32 s1, s1, 0 -; GFX8V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V4-NEXT: v_mov_b32_e32 v2, s1 ; GFX8V4-NEXT: v_mov_b32_e32 v3, s0 ; GFX8V4-NEXT: flat_store_dword v[0:1], v4 @@ -38,9 +35,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5: ; %bb.0: ; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0xc8 -; GFX8V5-NEXT: s_add_i32 s10, s10, s15 -; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX8V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 ; GFX8V5-NEXT: s_cselect_b32 s2, s2, 0 @@ -50,7 +45,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5-NEXT: v_mov_b32_e32 v1, s2 ; GFX8V5-NEXT: s_cselect_b32 s0, s3, 0 ; GFX8V5-NEXT: s_cselect_b32 s1, s1, 0 -; GFX8V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V5-NEXT: v_mov_b32_e32 v2, s1 ; GFX8V5-NEXT: v_mov_b32_e32 v3, s0 ; GFX8V5-NEXT: 
flat_store_dword v[0:1], v4 @@ -63,10 +57,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 ; GFX9V4-NEXT: s_cselect_b32 s2, s3, 0 @@ -76,7 +69,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V4-NEXT: v_mov_b32_e32 v1, s2 ; GFX9V4-NEXT: s_cselect_b32 s0, s5, 0 ; GFX9V4-NEXT: s_cselect_b32 s1, s1, 0 -; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V4-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V4-NEXT: v_mov_b32_e32 v3, s0 ; GFX9V4-NEXT: flat_store_dword v[0:1], v4 @@ -89,10 +81,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 ; GFX9V5-NEXT: s_cselect_b32 s2, s3, 0 @@ -102,7 +93,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-NEXT: v_mov_b32_e32 v1, s2 ; GFX9V5-NEXT: s_cselect_b32 s0, s5, 0 ; GFX9V5-NEXT: s_cselect_b32 s1, s1, 0 -; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V5-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V5-NEXT: v_mov_b32_e32 v3, s0 ; GFX9V5-NEXT: flat_store_dword v[0:1], v4 @@ -124,9 +114,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 ; 
GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 -; GFX8V4-NEXT: s_add_i32 s12, s12, s17 -; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -139,9 +126,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V5: ; %bb.0: ; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xcc ; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 -; GFX8V5-NEXT: s_add_i32 s10, s10, s15 -; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -184,9 +168,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 ; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 -; GFX8V4-NEXT: s_add_i32 s12, s12, s17 -; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -199,9 +180,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V5: ; %bb.0: ; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 ; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 -; GFX8V5-NEXT: s_add_i32 s10, s10, s15 -; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -285,10 +263,7 @@ define amdgpu_kernel void @llvm_debugtrap() { define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_add_i32 s12, s12, s17 ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 -; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
GFX8V4-NEXT: v_mov_b32_e32 v1, s7 ; GFX8V4-NEXT: s_add_u32 s0, s8, 8 ; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc @@ -313,9 +288,6 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_add_i32 s10, s10, s15 -; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8V5-NEXT: s_add_u32 s0, s6, 8 ; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V5-NEXT: s_addc_u32 s1, s7, 0 diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index ff5c71b65f34b..4d62d30a38ed3 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,15 +8,15 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %12 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %12 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %11 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %10 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %10 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %9 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %9 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() @@ -27,15 +27,15 @@ define 
amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %12 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %12 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %11 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %11 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %10 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %10 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %9 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %9 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() @@ -46,15 +46,15 @@ define amdgpu_kernel void @v_input_output_i128() { define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %12 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %12 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %11 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %11 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; 
GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %10 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %10 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %9 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %9 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index cf6ae3d5b4f68..c68138acc9b2b 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -22,9 +22,6 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; VI-LABEL: s_insertelement_v2bf16_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -85,9 +82,6 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; VI-LABEL: s_insertelement_v2bf16_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -150,9 +144,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, 
s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -225,9 +216,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -298,9 +286,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -373,9 +358,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -453,14 +435,11 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: 
flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -551,17 +530,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, s4, v0, v4 @@ -633,17 +609,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, v0, s4, v4 @@ -713,17 +686,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 
s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, s4, v1, v4 @@ -795,17 +765,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, v1, s4, v4 @@ -881,12 +848,9 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -978,12 +942,9 @@ define 
amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 @@ -1097,12 +1058,9 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -1279,14 +1237,11 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -1454,14 +1409,11 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index aafa4a04a00de..647870f0e0897 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -20,9 +20,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2i16_0: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -72,9 +69,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -91,9 +85,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0xc -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, 
s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -149,9 +140,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -172,9 +160,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0xc -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -237,9 +222,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -255,9 +237,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0xc -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -314,9 +293,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 
0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -336,9 +312,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -409,9 +382,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -434,9 +404,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -508,9 +475,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2i16_1: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: 
v_mov_b32_e32 v0, s0 @@ -559,9 +523,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -578,9 +539,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0xc -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -631,9 +589,6 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2f16_0: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -681,9 +636,6 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2f16_1: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CIVI-NEXT: s_add_i32 s10, s10, s15 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -732,9 +684,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: 
s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -753,9 +702,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -814,12 +760,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -836,12 +779,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v3, v[0:1] ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -898,9 +838,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 
-; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -919,9 +856,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -979,9 +913,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1000,9 +931,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1059,9 +987,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1080,9 +1005,6 @@ define 
amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1139,9 +1061,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1160,9 +1079,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1219,9 +1135,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1240,9 +1153,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1299,9 +1209,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1320,9 +1227,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1379,9 +1283,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1400,9 +1301,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1465,9 +1363,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; VI: ; 
%bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1487,9 +1382,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1554,12 +1446,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -1578,12 +1467,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v3, v[0:1] ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -1649,14 +1535,11 @@ define 
amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -1679,14 +1562,11 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_load_dword v4, v[0:1] ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -1757,17 +1637,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, 
s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, s4, v0, v4 @@ -1779,12 +1656,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1844,17 +1718,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, v0, s4, v4 @@ -1866,12 +1737,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, 
vcc, s2, v2 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1932,17 +1800,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, s4, v1, v4 @@ -1954,12 +1819,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -2019,17 +1881,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; 
VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, v1, s4, v4 @@ -2041,12 +1900,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -2107,17 +1963,14 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, s4, v1, v4 @@ -2129,12 +1982,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; 
CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -2200,9 +2050,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2230,9 +2077,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: flat_load_dword v4, v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2316,12 +2160,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -2345,12 +2186,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; CI-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -2420,12 +2258,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 @@ -2443,12 +2278,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -2509,12 +2341,9 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; 
VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -2532,12 +2361,9 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -2629,12 +2455,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -2686,12 +2509,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -2836,14 +2656,11 @@ define 
amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -2866,12 +2683,9 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s3 ; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[4:5] ; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v4 @@ -2947,14 +2761,12 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -2962,7 +2774,6 @@ define 
amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_perm_b32 v3, s4, v3, v12 @@ -2976,14 +2787,11 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -3120,14 +2928,11 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -3220,14 +3025,11 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 -; CI-NEXT: s_add_i32 s10, s10, 
s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3] ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll index 1a32953305bbc..496a1c652da25 100644 --- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll +++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll @@ -7,7 +7,7 @@ declare void @llvm.trap() #0 ; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_kernarg_size 8 -; DOORBELL-NEXT: .amdhsa_user_sgpr_count 14 +; DOORBELL-NEXT: .amdhsa_user_sgpr_count 12 ; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 ; DOORBELL: .end_amdhsa_kernel diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index c998a00727793..c201f84cac726 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -23,11 +23,8 @@ define void @function_lds_id(ptr addrspace(1) %out) { define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: kernel_lds_id: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s2, s12, 42 +; GCN-NEXT: s_add_i32 s2, s10, 42 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -75,9 +72,6 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l define amdgpu_kernel void 
@doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: doesnt_use_it: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v2, 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 44ea414dd4b93..39a3b1c8adc9f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -259,9 +259,6 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -272,9 +269,6 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -290,13 +284,10 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -305,13 +296,10 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -324,13 +312,10 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -339,14 +324,11 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; 
CHECK-GISEL-NEXT: s_mov_b32 s2, 0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -359,15 +341,12 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_m0: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm @@ -375,15 +354,12 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; CHECK-GISEL-LABEL: test_readfirstlane_m0: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: 
flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -397,31 +373,25 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -435,16 +405,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; 
CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -452,16 +419,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -475,16 +439,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 
v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -492,16 +453,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index ffceac2c912bb..24a332fa211c1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -179,9 +179,6 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -192,9 +189,6 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-GISEL-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -210,13 +204,10 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -225,13 +216,10 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -244,13 +232,10 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-SDAG-NEXT: 
v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -259,14 +244,11 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -280,9 +262,6 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -302,9 +281,6 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; 
CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -335,9 +311,6 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -359,9 +332,6 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -395,9 +365,6 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -419,9 +386,6 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: 
v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -455,15 +419,12 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; CHECK-SDAG-LABEL: test_readlane_m0_sreg: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm @@ -471,15 +432,12 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; CHECK-GISEL-LABEL: test_readlane_m0_sreg: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -496,14 +454,11 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: 
v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; @@ -513,13 +468,10 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -533,17 +485,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 ; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -556,13 +505,10 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-GISEL-NEXT: 
s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -577,17 +523,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 ; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -600,13 +543,10 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -621,31 +561,25 @@ define 
amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -659,16 +593,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 
s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -676,16 +607,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -699,16 +627,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-SDAG-NEXT: s_add_i32 s10, s10, s15 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -716,16 +641,13 @@ define amdgpu_kernel void 
@test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-GISEL-NEXT: s_add_i32 s10, s10, s15 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index da40a06c306b9..9d93ca65683c4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -15,9 +15,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; GFX802-SDAG-LABEL: test_writelane_sreg_i32: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -58,9 +55,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; GFX802-GISEL-LABEL: test_writelane_sreg_i32: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: 
s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -108,9 +102,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s6, s[6:7], 0x10 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -162,9 +153,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s6, s[6:7], 0x10 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -222,9 +210,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s6, s[6:7], 0x10 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -276,9 +261,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s6, s[6:7], 0x10 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -336,9 +318,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -383,9 +362,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -436,9 +412,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -489,9 +462,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; 
GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -548,14 +518,11 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 ; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -606,14 +573,11 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 ; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -669,9 +633,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; 
GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -733,9 +694,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -808,9 +766,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -878,9 +833,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -957,9 +909,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 
4, v0 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -968,7 +918,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1] ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -1031,9 +980,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1043,7 +990,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0 ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1118,18 +1064,15 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX802-SDAG-NEXT: ;;#ASMSTART ; GFX802-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX802-SDAG-NEXT: ;;#ASMEND -; GFX802-SDAG-NEXT: 
s_add_i32 s10, s10, s15 ; GFX802-SDAG-NEXT: s_mov_b32 s4, m0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX802-SDAG-NEXT: s_endpgm ; @@ -1176,18 +1119,15 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX802-GISEL-NEXT: ;;#ASMSTART ; GFX802-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX802-GISEL-NEXT: ;;#ASMEND -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 ; GFX802-GISEL-NEXT: s_mov_b32 s4, m0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX802-GISEL-NEXT: s_endpgm ; @@ -1238,9 +1178,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: 
v_mov_b32_e32 v0, s0 @@ -1285,9 +1222,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1337,9 +1271,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; GFX802-SDAG-LABEL: test_writelane_imm_i64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1385,9 +1316,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; GFX802-GISEL-LABEL: test_writelane_imm_i64: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1439,9 +1367,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; GFX802-SDAG-LABEL: test_writelane_imm_f64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1487,9 +1412,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; GFX802-GISEL-LABEL: test_writelane_imm_f64: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1542,9 +1464,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 @@ -1584,9 +1503,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 @@ -1632,13 +1548,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; GFX802-SDAG-NEXT: s_mov_b32 
flat_scratch_lo, s11 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0 @@ -1683,14 +1596,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s8 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3 @@ -1739,13 +1649,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0 @@ -1790,14 +1697,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 
0x0 ; GFX802-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s8 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3 @@ -1844,10 +1748,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -1881,10 +1782,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1924,14 +1822,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 ; 
GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0 @@ -1972,14 +1867,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -2025,14 +1917,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX802-SDAG-NEXT: s_add_i32 s10, s10, s15 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 -; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0 @@ -2073,14 
+1962,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_add_i32 s10, s10, s15 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index 59cc6dfac1200..7202ab8b31466 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -22,9 +22,6 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; GFX7-HSA-LABEL: constant_load_f64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -96,9 +93,6 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; GFX7-HSA-LABEL: constant_load_2v4f64: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 diff --git 
a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 5d69aa7d679be..2ee1c60b4bbf2 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -26,9 +26,6 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: constant_load_i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -111,9 +108,6 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -187,9 +181,6 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 4 @@ -296,9 +287,6 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -375,9 +363,6 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -464,9 +449,6 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; ; GCN-HSA-LABEL: constant_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -609,9 +591,6 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-HSA-LABEL: constant_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 @@ -825,9 +804,6 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: 
v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -905,9 +881,6 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -986,9 +959,6 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1066,9 +1036,6 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1145,9 +1112,6 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1235,9 +1199,6 @@ define 
amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1329,9 +1290,6 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1431,9 +1389,6 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1538,9 +1493,6 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1649,9 +1601,6 @@ define amdgpu_kernel void 
@constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1771,9 +1720,6 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1934,9 +1880,6 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2116,9 +2059,6 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2383,9 +2323,6 @@ define amdgpu_kernel void 
@constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2694,9 +2631,6 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3178,9 +3112,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3752,9 +3683,6 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4678,9 +4606,6 @@ define amdgpu_kernel void 
@constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5469,9 +5394,6 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5560,9 +5482,6 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5650,9 +5569,6 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5736,9 +5652,6 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: 
constant_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5825,15 +5738,12 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s0, s2, 16 ; GCN-HSA-NEXT: s_and_b32 s1, s2, 0xffff @@ -5924,9 +5834,6 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -6032,13 +5939,10 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16 ; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16 @@ -6172,9 +6076,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6333,13 +6234,10 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 @@ -6556,9 +6454,6 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6822,13 +6717,10 @@ define amdgpu_kernel void 
@constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16 @@ -7212,9 +7104,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7693,13 +7582,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s19, s1, 16 ; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 16 @@ -8424,9 +8310,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; 
GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 6283f6bb3c5e3..4ab55164e0999 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -23,9 +23,6 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; GFX7-HSA-LABEL: constant_load_i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -108,9 +105,6 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v2i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -200,9 +194,6 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -299,9 +290,6 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v4i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -402,9 +390,6 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v8i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -541,9 +526,6 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v9i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s12, s[10:11], 0x8 @@ -707,9 +689,6 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v10i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8 @@ -882,9 +861,6 @@ define amdgpu_kernel void @constant_load_v11i32(ptr 
addrspace(1) %out, ptr addrs ; GFX7-HSA-LABEL: constant_load_v11i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -1062,9 +1038,6 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v12i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 @@ -1248,9 +1221,6 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-LABEL: constant_load_v16i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 48 @@ -1439,9 +1409,6 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; GFX7-HSA-LABEL: constant_zextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1528,9 +1495,6 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) 
%out, p ; GFX7-HSA-LABEL: constant_sextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1623,9 +1587,6 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1712,9 +1673,6 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1809,15 +1767,12 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 @@ -1912,9 +1867,6 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -2031,16 +1983,13 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 @@ -2169,9 +2118,6 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2336,10 +2282,8 @@ define 
amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 @@ -2347,7 +2291,6 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 @@ -2548,9 +2491,6 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -2849,9 +2789,6 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3308,16 +3245,13 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % 
; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 @@ -3747,9 +3681,6 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4608,17 +4539,14 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s34, s36, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s35, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; 
GFX7-HSA-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 @@ -5232,9 +5160,6 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-LABEL: constant_load_v32i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 45f0af8d423b6..46c7c2b08cd64 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -22,9 +22,6 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; GFX7-LABEL: constant_load_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -100,9 +97,6 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-LABEL: constant_load_v2i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 @@ -189,9 +183,6 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-LABEL: constant_load_v3i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; 
GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -308,9 +299,6 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v4i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -442,9 +430,6 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-LABEL: constant_load_v8i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-NEXT: s_add_u32 s18, s16, 48 @@ -664,9 +649,6 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-LABEL: constant_load_v16i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 43b79973187a6..ce17c81a24dd5 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -27,9 +27,6 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; 
GFX7-HSA-LABEL: constant_load_i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -117,9 +114,6 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v2i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -205,9 +199,6 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v3i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -320,9 +311,6 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v4i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -394,9 +382,6 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v8i8: ; GFX7-HSA: ; %bb.0: ; %entry ; 
GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -473,9 +458,6 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v16i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -559,9 +541,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_zextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -639,9 +618,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_sextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -720,9 +696,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 
-; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -800,9 +773,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -884,9 +854,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -988,9 +955,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1090,9 +1054,6 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; 
GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1196,9 +1157,6 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1302,9 +1260,6 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1411,9 +1366,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1533,9 +1485,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1697,9 +1646,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1884,9 +1830,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2155,9 +2098,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2474,9 +2414,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; 
GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2961,9 +2898,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3549,9 +3483,6 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4471,9 +4402,6 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5283,9 +5211,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_zextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, 
s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5370,9 +5295,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_sextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5460,9 +5382,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5545,9 +5464,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5638,9 +5554,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5750,9 +5663,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5868,13 +5778,10 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_bfe_u32 s4, s2, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 24 @@ -6011,9 +5918,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6175,13 +6079,10 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64: ; 
GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24 @@ -6402,9 +6303,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6676,13 +6574,10 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s8, s5, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s9, s4, 24 @@ -7073,9 +6968,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, 
s15 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7567,13 +7459,10 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24 @@ -8316,9 +8205,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -9093,9 +8979,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_zextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9182,9 +9065,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; 
GFX7-HSA-LABEL: constant_sextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9273,9 +9153,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9362,9 +9239,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9456,9 +9330,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9560,9 +9431,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX7-HSA: ; 
%bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9677,9 +9545,6 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9816,9 +9681,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9977,9 +9839,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -10184,9 +10043,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -10436,9 +10292,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10797,9 +10650,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11249,9 +11099,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_add_i32 s10, s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -11916,9 +11763,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_add_i32 s10, 
s10, s15 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 6a973d0adeffa..e0c2d00891250 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -28,9 +28,6 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace( ; GCN-HSA-LABEL: global_load_i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -136,9 +133,6 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -225,9 +219,6 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -359,9 +350,6 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr 
addrspac ; GCN-HSA-LABEL: global_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -447,9 +435,6 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -538,9 +523,6 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-LABEL: global_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -691,9 +673,6 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; GCN-HSA-LABEL: global_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -843,9 +822,6 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -931,9 +907,6 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1022,9 +995,6 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1110,9 +1080,6 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1203,9 +1170,6 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1305,9 +1269,6 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1409,9 +1370,6 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1522,9 +1480,6 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1642,9 +1597,6 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1760,9 +1712,6 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1885,9 +1834,6 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2037,9 +1983,6 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2204,9 +2147,6 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2443,9 
+2383,6 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2717,9 +2654,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -3131,9 +3065,6 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3653,9 +3584,6 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -4460,9 +4388,6 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou 
; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5228,9 +5153,6 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5328,9 +5250,6 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5426,9 +5345,6 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5521,9 +5437,6 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5622,9 +5535,6 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5734,9 +5644,6 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5855,9 +5762,6 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6003,9 +5907,6 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 
flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6166,10 +6067,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6184,11 +6085,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, v4 @@ -6388,9 +6286,6 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6641,10 +6536,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6661,10 +6556,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, v8 @@ -7024,9 +6916,6 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -7498,9 +7387,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -8203,9 +8089,6 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; 
GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 8322eee826495..4d7f1a9663c3d 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -27,9 +27,6 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; GCNX3-HSA-LABEL: global_load_i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -109,9 +106,6 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v2i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -192,9 +186,6 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v3i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -279,9 +270,6 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v4i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -364,9 +352,6 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v8i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -473,9 +458,6 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v9i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -607,9 +589,6 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v10i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; 
GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -740,9 +719,6 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v11i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -878,9 +854,6 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v12i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -1014,9 +987,6 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v16i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -1164,9 +1134,6 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ 
-1250,9 +1217,6 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1337,9 +1301,6 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1423,9 +1384,6 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1513,9 +1471,6 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1613,9 +1568,6 @@ define 
amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1718,10 +1670,8 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1729,7 +1679,6 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2 @@ -1847,9 +1796,6 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1991,10 +1937,8 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr 
addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2009,7 +1953,6 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0 @@ -2187,9 +2130,6 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2426,9 +2366,6 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2790,10 +2727,8 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: 
global_zextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2827,7 +2762,6 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) @@ -3184,9 +3118,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3658,7 +3589,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s15 +; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s13 ; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 @@ -3978,9 +3909,6 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64: ; 
GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -4505,9 +4433,6 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v32i32: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCNX3-HSA-NEXT: s_add_i32 s10, s10, s15 -; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll index 1a6fa3c518ca7..4dfc773d615e4 100644 --- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll @@ -13,8 +13,7 @@ ; GCN: s_cselect_b32 ; GCN-NOT: load_dword -; GCN: flat_load_dword -; GCN: flat_load_dword +; GCN: flat_load_dwordx2 ; GCN-NOT: load_dword ; GCN: flat_store_dwordx2 diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll index 245a2775d9f2f..e876a8d9dda69 100644 --- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll +++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll @@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 ; GCN-LABEL: {{^}}get_global_id_0: ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff -; GCN: s_mul_i32 [[MUL:s[0-9]+]], s12, [[WGSIZEX]] +; GCN: s_mul_i32 [[MUL:s[0-9]+]], s10, [[WGSIZEX]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0 define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 { %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() diff --git 
a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll index e8632871f56ea..92536c2078514 100644 --- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_mul_i32 s12, s12, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s12 +; GFX9-NEXT: s_mul_i32 s10, s10, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s10 ; GFX9-NEXT: v_add_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] @@ -39,8 +39,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s4, s4, 0xffff -; GFX10-NEXT: s_mul_i32 s12, s12, s4 -; GFX10-NEXT: v_add3_u32 v0, s5, s12, v0 +; GFX10-NEXT: s_mul_i32 s10, s10, s4 +; GFX10-NEXT: v_add3_u32 v0, s5, s10, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 0348737a41a30..0a76e169e9c38 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -9,8 +9,6 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 ; CHECK-LABEL: memcpy_p0_p0_minsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 @@ -188,7 +186,7 @@ define 
amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s15 +; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 @@ -568,9 +566,7 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s15 +; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s0 @@ -985,8 +981,6 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-NEXT: ds_read_u8 v8, v2 offset:117 ; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 ; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -1261,8 +1255,6 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK-LABEL: memcpy_p0_p0_optsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 @@ -1440,7 +1432,7 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 ; CHECK-NEXT: v_mov_b32_e32 
v0, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s15 +; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 @@ -1820,9 +1812,7 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s15 +; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s0 @@ -2237,8 +2227,6 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-NEXT: ds_read_u8 v8, v2 offset:117 ; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 ; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 7868fa9a7ce4c..1c33d8a19890d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -15,9 +15,6 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX7-LABEL: flat_agent_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX10-WGP-LABEL: flat_agent_unordered_load: ; 
GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX10-CU-LABEL: flat_agent_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -217,9 +202,6 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX7-LABEL: flat_agent_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 
s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -235,10 +217,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -255,10 +233,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX10-CU-LABEL: flat_agent_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -290,8 +264,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -306,8 +278,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -419,9 +389,6 @@ entry: define amdgpu_kernel void 
@flat_agent_acquire_load( ; GFX7-LABEL: flat_agent_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -438,10 +405,6 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX10-WGP-LABEL: flat_agent_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -460,10 +423,6 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX10-CU-LABEL: flat_agent_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -497,8 +456,6 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -514,8 +471,6 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -636,9 +591,6 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX7-LABEL: flat_agent_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -656,10 +608,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -680,10 +628,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -720,8 +664,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -738,8 +680,6 @@ 
define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -879,9 +819,6 @@ entry: define amdgpu_kernel void @flat_agent_unordered_store( ; GFX7-LABEL: flat_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -893,10 +830,6 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX10-WGP-LABEL: flat_agent_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -909,10 +842,6 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX10-CU-LABEL: flat_agent_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -936,8 +865,6 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -949,8 +876,6 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1034,9 +959,6 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX7-LABEL: flat_agent_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1048,10 +970,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1064,10 +982,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX10-CU-LABEL: flat_agent_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; 
GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1091,8 +1005,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1104,8 +1016,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1189,9 +1099,6 @@ entry: define amdgpu_kernel void @flat_agent_release_store( ; GFX7-LABEL: flat_agent_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1204,10 +1111,6 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX10-WGP-LABEL: flat_agent_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1222,10 +1125,6 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX10-CU-LABEL: flat_agent_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 
s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1252,8 +1151,6 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1266,8 +1163,6 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1370,9 +1265,6 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX7-LABEL: flat_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1385,10 +1277,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; 
GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1403,10 +1291,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1433,8 +1317,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1447,8 +1329,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1551,9 +1431,6 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX7-LABEL: flat_agent_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1566,10 +1443,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw: ; GFX10-WGP: ; 
%bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1583,10 +1456,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1612,8 +1481,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1626,8 +1493,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1720,9 +1585,6 @@ entry: define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX7-LABEL: flat_agent_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1737,10 +1599,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1758,10 +1616,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1792,8 +1646,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1808,8 +1660,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_nop 0 @@ -1920,9 +1770,6 @@ entry: define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX7-LABEL: flat_agent_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1936,10 +1783,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1955,10 +1798,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1987,8 +1826,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2002,8 +1839,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2115,9 +1950,6 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX7-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2133,10 +1965,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -2156,10 +1984,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2193,8 +2017,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2210,8 +2032,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2341,9 +2161,6 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX7-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2359,10 +2176,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -2382,10 +2195,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2419,8 +2228,6 @@ define 
amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2436,8 +2243,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2567,9 +2372,6 @@ entry: define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2586,10 +2388,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2608,10 +2406,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; 
GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2645,8 +2439,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2662,8 +2454,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2782,9 +2572,6 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2802,10 +2589,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; 
GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2826,10 +2609,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2866,8 +2645,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2884,8 +2661,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3027,9 +2802,6 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3047,10 +2819,6 @@ define amdgpu_kernel void 
@flat_agent_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3071,10 +2839,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3111,8 +2875,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3129,8 +2891,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3272,9 +3032,6 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: 
flat_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3300,10 +3057,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3329,10 +3082,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3383,8 +3132,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3400,8 +3147,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_agent_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3520,9 +3265,6 @@ entry: define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3550,10 +3292,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3583,10 +3321,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3642,8 +3376,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3661,8 +3393,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3799,9 +3529,6 @@ entry: define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3828,10 +3555,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3859,10 +3582,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: 
s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3916,8 +3635,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3934,8 +3651,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4073,9 +3788,6 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4104,10 +3816,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4139,10 +3847,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4201,8 +3905,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4221,8 +3923,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4378,9 +4078,6 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: 
s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4409,10 +4106,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4444,10 +4137,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4506,8 +4195,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4526,8 +4213,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 
s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4683,9 +4368,6 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4713,10 +4395,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4746,10 +4424,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4805,8 +4479,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4824,8 +4496,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4962,9 +4632,6 @@ entry: define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4992,10 +4659,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5025,10 +4688,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5084,8 +4743,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5103,8 +4760,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5241,9 +4896,6 @@ entry: define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5272,10 +4924,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5307,10 +4955,6 @@ define amdgpu_kernel 
void @flat_agent_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5369,8 +5013,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5389,8 +5031,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5546,9 +5186,6 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5577,10 +5214,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX10-WGP: 
; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5612,10 +5245,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5674,8 +5303,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5694,8 +5321,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5851,9 +5476,6 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry 
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5882,10 +5504,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5917,10 +5535,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5979,8 +5593,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5999,8 +5611,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6156,9 +5766,6 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6187,10 +5794,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6222,10 +5825,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6284,8 +5883,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6304,8 +5901,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6461,9 +6056,6 @@ entry: define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6492,10 +6084,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6527,10 +6115,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6589,8 +6173,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6609,8 +6191,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6766,9 +6346,6 @@ entry: define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6797,10 +6374,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6832,10 +6405,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6894,8 +6463,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6914,8 +6481,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7071,9 +6636,6 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -7102,10 +6664,6 @@ define 
amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -7137,10 +6695,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -7199,8 +6753,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7219,8 +6771,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7376,9 +6926,6 @@ entry: define 
amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -7407,10 +6954,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -7442,10 +6985,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -7504,8 +7043,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7524,8 +7061,6 @@ define amdgpu_kernel void 
@flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7681,9 +7216,6 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7712,10 +7244,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7744,10 +7272,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7804,8 +7328,6 @@ define amdgpu_kernel void 
@flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7824,8 +7346,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7971,9 +7491,6 @@ entry: define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8003,10 +7520,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8037,10 +7550,6 @@ define amdgpu_kernel void 
@flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8099,8 +7608,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8120,8 +7627,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8276,9 +7781,6 @@ entry: define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8308,10 +7810,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; 
GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8342,10 +7840,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8405,8 +7899,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8426,8 +7918,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8592,9 +8082,6 @@ entry: define amdgpu_kernel 
void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8625,10 +8112,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8661,10 +8144,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8726,8 +8205,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8748,8 +8225,6 @@ define 
amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8927,9 +8402,6 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8960,10 +8432,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8996,10 +8464,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9061,8 +8525,6 @@ define 
amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9083,8 +8545,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9262,9 +8722,6 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9294,10 +8751,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9328,10 +8781,6 @@ define amdgpu_kernel void 
@flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9390,8 +8839,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9411,8 +8858,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9571,9 +9016,6 @@ entry: define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9603,10 +9045,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; 
GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9637,10 +9075,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9699,8 +9133,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9720,8 +9152,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9876,9 +9306,6 @@ entry: define amdgpu_kernel void 
@flat_agent_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9909,10 +9336,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9945,10 +9368,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10010,8 +9429,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10032,8 +9449,6 @@ define amdgpu_kernel void 
@flat_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10211,9 +9626,6 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10244,10 +9656,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10280,10 +9688,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10345,8 +9749,6 @@ define amdgpu_kernel void 
@flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10367,8 +9769,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10546,9 +9946,6 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10579,10 +9976,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10615,10 +10008,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; 
GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10680,8 +10069,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10702,8 +10089,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10881,9 +10266,6 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10914,10 +10296,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10950,10 +10328,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11015,8 +10389,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11037,8 +10409,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11216,9 +10586,6 @@ entry: define amdgpu_kernel void 
@flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11249,10 +10616,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11285,10 +10648,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11350,8 +10709,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11372,8 +10729,6 @@ define amdgpu_kernel 
void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11547,9 +10902,6 @@ entry: define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11580,10 +10932,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11616,10 +10964,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11681,8 +11025,6 @@ define amdgpu_kernel void 
@flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11703,8 +11045,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11882,9 +11222,6 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11915,10 +11252,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11951,10 +11284,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; 
GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -12016,8 +11345,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -12038,8 +11365,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -12217,9 +11542,6 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -12250,10 +11572,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; 
GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -12286,10 +11604,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -12351,8 +11665,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -12373,8 +11685,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -12552,9 +11862,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX7-LABEL: 
flat_agent_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12570,10 +11877,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12590,10 +11893,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12625,8 +11924,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12641,8 +11938,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12754,9 +12049,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX7-LABEL: flat_agent_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12772,10 +12064,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12792,10 +12080,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12827,8 +12111,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12843,8 +12125,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12956,9 +12236,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX7-LABEL: flat_agent_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12976,10 +12253,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12999,10 +12272,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -13038,8 +12307,6 
@@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13056,8 +12323,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13183,9 +12448,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX7-LABEL: flat_agent_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -13204,10 +12466,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13229,10 +12487,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, 
s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -13271,8 +12525,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13290,8 +12542,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13436,9 +12686,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX7-LABEL: flat_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13450,10 +12697,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; 
GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13466,10 +12709,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13493,8 +12732,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13506,8 +12743,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13591,9 +12826,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX7-LABEL: flat_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13605,10 +12837,6 @@ define 
amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13621,10 +12849,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13648,8 +12872,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13661,8 +12883,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13746,9 +12966,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX7-LABEL: 
flat_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13761,10 +12978,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13779,10 +12992,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13809,8 +13018,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13823,8 +13030,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13927,9 +13132,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX7-LABEL: flat_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13942,10 +13144,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13960,10 +13158,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13990,8 +13184,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword 
s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -14004,8 +13196,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -14108,9 +13298,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14123,10 +13310,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -14140,10 +13323,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: 
s_nop 0 @@ -14169,8 +13348,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -14183,8 +13360,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -14277,9 +13452,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14294,10 +13466,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -14314,10 +13482,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: 
flat_agent_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -14347,8 +13511,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -14363,8 +13525,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -14473,9 +13633,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14489,10 +13646,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -14508,10 +13661,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -14540,8 +13689,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -14555,8 +13702,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -14668,9 +13813,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14686,10 +13828,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -14708,10 +13846,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -14744,8 +13878,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -14761,8 +13893,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_nop 0 @@ -14890,9 +14020,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14908,10 +14035,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -14930,10 +14053,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -14966,8 +14085,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -14983,8 +14100,6 @@ define amdgpu_kernel void 
@flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -15112,9 +14227,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15132,10 +14244,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15155,10 +14263,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15194,8 +14298,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15212,8 +14314,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15337,9 +14437,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15358,10 +14455,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15383,10 +14476,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; 
GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15425,8 +14514,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15444,8 +14531,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15592,9 +14677,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15613,10 +14695,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15638,10 +14716,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15680,8 +14754,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15699,8 +14771,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15847,9 +14917,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15875,10 +14942,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15904,10 +14967,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15958,8 +15017,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15975,8 +15032,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16095,9 +15150,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16125,10 +15177,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16157,10 +15205,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16215,8 +15259,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16234,8 +15276,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16370,9 +15410,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16399,10 +15436,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16430,10 +15463,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16487,8 +15516,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16505,8 +15532,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16644,9 +15669,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16675,10 +15697,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16709,10 +15727,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16770,8 +15784,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16790,8 +15802,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16945,9 +15955,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16976,10 +15983,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17010,10 +16013,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17071,8 +16070,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17091,8 +16088,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; 
%entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17246,9 +16241,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17276,10 +16268,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17308,10 +16296,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17366,8 +16350,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17385,8 +16367,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17521,9 +16501,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17551,10 +16528,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17583,10 +16556,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; 
GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17641,8 +16610,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17660,8 +16627,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17796,9 +16761,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17827,10 +16789,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; 
GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17861,10 +16819,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17922,8 +16876,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17942,8 +16894,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18097,9 +17047,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry 
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18128,10 +17075,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18162,10 +17105,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18223,8 +17162,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18243,8 +17180,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: 
; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18398,9 +17333,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18429,10 +17361,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18463,10 +17391,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18524,8 +17448,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18544,8 +17466,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18699,9 +17619,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18730,10 +17647,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18764,10 +17677,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18825,8 +17734,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18845,8 +17752,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19000,9 +17905,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -19031,10 +17933,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19065,10 +17963,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19126,8 +18020,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19146,8 +18038,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19301,9 +18191,6 @@ entry: define amdgpu_kernel void 
@flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -19332,10 +18219,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19366,10 +18249,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19427,8 +18306,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19447,8 +18324,6 @@ define 
amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19602,9 +18477,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -19633,10 +18505,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19667,10 +18535,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19728,8 +18592,6 @@ define 
amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19748,8 +18610,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19903,9 +18763,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -19934,10 +18791,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19968,10 +18821,6 @@ define amdgpu_kernel void 
@flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -20029,8 +18878,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20049,8 +18896,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20204,9 +19049,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20235,10 +19077,6 @@ define amdgpu_kernel void 
@flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20267,10 +19105,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20327,8 +19161,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20347,8 +19179,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20494,9 +19324,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20527,10 +19354,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20562,10 +19385,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20626,8 +19445,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20648,8 +19465,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20809,9 +19624,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20841,10 +19653,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20875,10 +19683,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20938,8 +19742,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20959,8 +19761,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21125,9 +19925,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21159,10 +19956,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21196,10 +19989,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21263,8 +20052,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21286,8 +20073,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21470,9 +20255,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry 
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21504,10 +20286,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21541,10 +20319,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21608,8 +20382,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21631,8 +20403,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; 
GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21815,9 +20585,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21848,10 +20615,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21883,10 +20646,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21947,8 +20706,6 @@ define amdgpu_kernel 
void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21969,8 +20726,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22134,9 +20889,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22167,10 +20919,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22202,10 +20950,6 @@ define 
amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22266,8 +21010,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22288,8 +21030,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22449,9 +21189,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22483,10 +21220,6 @@ define 
amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22520,10 +21253,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22587,8 +21316,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22610,8 +21337,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22794,9 +21519,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22828,10 +21550,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22865,10 +21583,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22932,8 +21646,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 
s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22955,8 +21667,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23139,9 +21849,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23173,10 +21880,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23210,10 +21913,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23277,8 +21976,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23300,8 +21997,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23484,9 +22179,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23518,10 +22210,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23555,10 +22243,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23622,8 +22306,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23645,8 +22327,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23829,9 +22509,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 
s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23863,10 +22540,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23900,10 +22573,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23967,8 +22636,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23990,8 +22657,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; 
%bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -24170,9 +22835,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -24204,10 +22866,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -24241,10 +22899,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -24308,8 +22962,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -24331,8 +22983,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -24515,9 +23165,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -24549,10 +23196,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -24586,10 +23229,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -24653,8 +23292,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -24676,8 +23313,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -24860,9 +23495,6 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -24894,10 +23526,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -24931,10 +23559,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -24998,8 +23622,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -25021,8 +23643,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 diff --git 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index f189562bafe5f..b2340caa2933f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -15,9 +15,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-CU-LABEL: flat_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] 
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -217,9 +202,6 @@ entry: define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-LABEL: flat_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 @@ -248,10 +230,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 @@ -279,10 +257,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-CU-LABEL: flat_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 @@ -338,8 +312,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; 
GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff @@ -369,8 +341,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff @@ -581,9 +551,6 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX7-LABEL: flat_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -599,10 +566,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -619,10 +582,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-CU-LABEL: flat_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -654,8 +613,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -670,8 +627,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -783,9 +738,6 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-LABEL: flat_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -814,10 +766,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -846,10 +794,6 @@ define 
amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-CU-LABEL: flat_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -906,8 +850,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -938,8 +880,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1151,9 +1091,6 @@ entry: define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX7-LABEL: flat_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -1170,10 +1107,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 
s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1191,10 +1124,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX10-CU-LABEL: flat_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1228,8 +1157,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1245,8 +1172,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index 089d2a69facc1..304c80d7bb24d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -15,9 +15,6 @@ define amdgpu_kernel void 
@flat_singlethread_unordered_load( ; GFX7-LABEL: flat_singlethread_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX10-WGP-LABEL: flat_singlethread_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX10-CU-LABEL: flat_singlethread_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -217,9 +202,6 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX7-LABEL: flat_singlethread_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -235,10 +217,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -255,10 +233,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -290,8 +264,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -306,8 +278,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -419,9 +389,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX7-LABEL: flat_singlethread_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -437,10 +404,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -457,10 +420,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -492,8 
+451,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -508,8 +465,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -621,9 +576,6 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX7-LABEL: flat_singlethread_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -639,10 +591,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -659,10 +607,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; 
GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -694,8 +638,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -710,8 +652,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -823,9 +763,6 @@ entry: define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX7-LABEL: flat_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -837,10 +774,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX10-WGP-LABEL: flat_singlethread_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, 
s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -853,10 +786,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX10-CU-LABEL: flat_singlethread_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -880,8 +809,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -893,8 +820,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -978,9 +903,6 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX7-LABEL: flat_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -992,10 +914,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; 
GFX10-WGP-LABEL: flat_singlethread_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1008,10 +926,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1035,8 +949,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1048,8 +960,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1133,9 +1043,6 @@ entry: define amdgpu_kernel void @flat_singlethread_release_store( ; GFX7-LABEL: flat_singlethread_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1147,10 +1054,6 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX10-WGP-LABEL: flat_singlethread_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1163,10 +1066,6 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX10-CU-LABEL: flat_singlethread_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1190,8 +1089,6 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1203,8 +1100,6 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1288,9 +1183,6 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX7-LABEL: flat_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1302,10 +1194,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1318,10 +1206,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1345,8 +1229,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[6:7], s[6:7], 0x8 @@ -1358,8 +1240,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1443,9 +1323,6 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1458,10 +1335,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1475,10 +1348,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1504,8 +1373,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; 
GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1518,8 +1385,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1612,9 +1477,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX7-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1627,10 +1489,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1644,10 +1502,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: 
s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1673,8 +1527,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1687,8 +1539,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1781,9 +1631,6 @@ entry: define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX7-LABEL: flat_singlethread_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1796,10 +1643,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; 
GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1813,10 +1656,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1842,8 +1681,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1856,8 +1693,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1950,9 +1785,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1965,10 +1797,6 @@ define amdgpu_kernel 
void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1982,10 +1810,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2011,8 +1835,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2025,8 +1847,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2119,9 +1939,6 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX7-LABEL: 
flat_singlethread_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2134,10 +1951,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -2151,10 +1964,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2180,8 +1989,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2194,8 +2001,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2288,9 +2093,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2306,10 +2108,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2326,10 +2124,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2361,8 +2155,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2377,8 +2169,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2488,9 +2278,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2506,10 +2293,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2526,10 +2309,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2561,8 +2340,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2577,8 +2354,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2688,9 +2463,6 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2706,10 +2478,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; 
GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2726,10 +2494,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2761,8 +2525,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2777,8 +2539,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2888,9 +2648,6 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2916,10 +2673,6 @@ define amdgpu_kernel void 
@flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2945,10 +2698,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2999,8 +2748,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3016,8 +2763,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 @@ -3136,9 +2881,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3164,10 +2906,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3193,10 +2931,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3247,8 +2981,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 
0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3264,8 +2996,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3384,9 +3114,6 @@ entry: define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3412,10 +3139,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3441,10 +3164,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 
s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3495,8 +3214,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3512,8 +3229,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3632,9 +3347,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3660,10 +3372,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3689,10 +3397,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3743,8 +3447,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3760,8 +3462,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3880,9 +3580,6 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, 
s[4:5], 0x2 @@ -3908,10 +3605,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3937,10 +3630,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3991,8 +3680,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4008,8 +3695,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4128,9 +3813,6 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4156,10 +3838,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4185,10 +3863,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4239,8 +3913,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4256,8 +3928,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4376,9 +4046,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4404,10 +4071,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4433,10 +4096,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 
s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4487,8 +4146,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4504,8 +4161,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4624,9 +4279,6 @@ entry: define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4652,10 +4304,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 
s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4681,10 +4329,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4735,8 +4379,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4752,8 +4394,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4872,9 +4512,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, 
s[4:5], 0x2 @@ -4900,10 +4537,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4929,10 +4562,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4983,8 +4612,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5000,8 +4627,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 
0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5120,9 +4745,6 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5148,10 +4770,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5177,10 +4795,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5231,8 +4845,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5248,8 +4860,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5368,9 +4978,6 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5396,10 +5003,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5425,10 +5028,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5479,8 +5078,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5496,8 +5093,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5616,9 +5211,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5644,10 +5236,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5673,10 +5261,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5727,8 +5311,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5744,8 +5326,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5864,9 +5444,6 @@ entry: define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ 
-5892,10 +5469,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5921,10 +5494,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5975,8 +5544,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5992,8 +5559,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6112,9 +5677,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6140,10 +5702,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6169,10 +5727,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6223,8 +5777,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6240,8 +5792,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6360,9 +5910,6 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6388,10 +5935,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6417,10 +5960,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6471,8 +6010,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6488,8 +6025,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6608,9 +6143,6 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6639,10 +6171,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 
0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6671,10 +6199,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6731,8 +6255,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6751,8 +6273,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6898,9 +6418,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: 
s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6929,10 +6446,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6961,10 +6474,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7021,8 +6530,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7041,8 +6548,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7188,9 +6693,6 @@ entry: define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7219,10 +6721,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7251,10 +6749,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7311,8 +6805,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7331,8 +6823,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7478,9 +6968,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7509,10 +6996,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7541,10 +7024,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; 
GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7601,8 +7080,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7621,8 +7098,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7768,9 +7243,6 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7799,10 +7271,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 
-; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7831,10 +7299,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7891,8 +7355,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7911,8 +7373,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8058,9 +7518,6 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: 
flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8089,10 +7546,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8121,10 +7574,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8181,8 +7630,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8201,8 +7648,6 @@ define amdgpu_kernel 
void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8348,9 +7793,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8379,10 +7821,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8411,10 +7849,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ 
-8471,8 +7905,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8491,8 +7923,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8638,9 +8068,6 @@ entry: define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8669,10 +8096,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ 
-8701,10 +8124,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8761,8 +8180,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8781,8 +8198,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8928,9 +8343,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8959,10 
+8371,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8991,10 +8399,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9051,8 +8455,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9071,8 +8473,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9218,9 +8618,6 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9249,10 +8646,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9281,10 +8674,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9341,8 +8730,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 
s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9361,8 +8748,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9508,9 +8893,6 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9539,10 +8921,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9571,10 +8949,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9631,8 +9005,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9651,8 +9023,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9798,9 +9168,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9829,10 +9196,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9861,10 +9224,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9921,8 +9280,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9941,8 +9298,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10088,9 +9443,6 @@ entry: define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10119,10 +9471,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10151,10 +9499,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10211,8 +9555,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10231,8 +9573,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10378,9 +9718,6 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10409,10 +9746,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10441,10 +9774,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10501,8 +9830,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10521,8 +9848,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10668,9 +9993,6 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10699,10 +10021,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10731,10 +10049,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10791,8 +10105,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10811,8 +10123,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10958,9 +10268,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX7-LABEL: flat_singlethread_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10976,10 +10283,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX10-WGP-LABEL: 
flat_singlethread_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10996,10 +10299,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11031,8 +10330,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11047,8 +10344,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11160,9 +10455,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX7: 
; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11178,10 +10470,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11198,10 +10486,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11233,8 +10517,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11249,8 +10531,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11362,9 +10642,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX7-LABEL: flat_singlethread_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11380,10 +10657,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11400,10 +10673,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11435,8 +10704,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11451,8 +10718,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11564,9 +10829,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11582,10 +10844,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11602,10 +10860,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; 
GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11637,8 +10891,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11653,8 +10905,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11766,9 +11016,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX7-LABEL: flat_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11780,10 +11027,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ 
-11796,10 +11039,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11823,8 +11062,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11836,8 +11073,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11921,9 +11156,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11935,10 +11167,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; 
GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11951,10 +11179,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11978,8 +11202,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11991,8 +11213,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12076,9 +11296,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX7-LABEL: 
flat_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12090,10 +11307,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12106,10 +11319,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12133,8 +11342,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12146,8 +11353,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12231,9 +11436,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12245,10 +11447,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12261,10 +11459,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12288,8 +11482,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12301,8 +11493,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12386,9 +11576,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12401,10 +11588,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12418,10 +11601,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12447,8 +11626,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12461,8 +11638,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12555,9 +11730,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12570,10 +11742,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), 
s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12587,10 +11755,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12616,8 +11780,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12630,8 +11792,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12724,9 +11884,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: 
s_load_dword s4, s[4:5], 0x2 @@ -12739,10 +11896,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12756,10 +11909,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12785,8 +11934,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12799,8 +11946,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_nop 0 @@ -12893,9 +12038,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12908,10 +12050,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12925,10 +12063,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12954,8 +12088,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12968,8 +12100,6 @@ 
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13062,9 +12192,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13077,10 +12204,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13094,10 +12217,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -13123,8 +12242,6 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13137,8 +12254,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13231,9 +12346,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13249,10 +12361,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13269,10 +12377,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: 
flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13304,8 +12408,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13320,8 +12422,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13431,9 +12531,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13449,10 +12546,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: 
; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13469,10 +12562,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13504,8 +12593,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13520,8 +12607,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13631,9 +12716,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: 
flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13649,10 +12731,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13669,10 +12747,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13704,8 +12778,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13720,8 +12792,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; 
GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13831,9 +12901,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13859,10 +12926,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13888,10 +12951,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13942,8 +13001,6 @@ define amdgpu_kernel 
void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13959,8 +13016,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14079,9 +13134,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14107,10 +13159,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ 
-14136,10 +13184,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14190,8 +13234,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14207,8 +13249,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14327,9 +13367,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: 
s_load_dword s7, s[4:5], 0x2 @@ -14355,10 +13392,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14384,10 +13417,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14438,8 +13467,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14455,8 +13482,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14575,9 +13600,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14603,10 +13625,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14632,10 +13650,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14686,8 +13700,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14703,8 +13715,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14823,9 +13833,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14851,10 +13858,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14880,10 +13883,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: 
s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14934,8 +13933,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14951,8 +13948,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15071,9 +14066,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15099,10 +14091,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: 
; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15128,10 +14116,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15182,8 +14166,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15199,8 +14181,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15319,9 +14299,6 @@ entry: define amdgpu_kernel void 
@flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15347,10 +14324,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15376,10 +14349,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15430,8 +14399,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 @@ -15447,8 +14414,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15567,9 +14532,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15595,10 +14557,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15624,10 +14582,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15678,8 +14632,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15695,8 +14647,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15815,9 +14765,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15843,10 +14790,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 
s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15872,10 +14815,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15926,8 +14865,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15943,8 +14880,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16063,9 +14998,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16091,10 +15023,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16120,10 +15048,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16174,8 +15098,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16191,8 +15113,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16311,9 +15231,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16339,10 +15256,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16368,10 +15281,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16422,8 +15331,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16439,8 +15346,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16559,9 +15464,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16587,10 +15489,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16616,10 +15514,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; 
GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16670,8 +15564,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16687,8 +15579,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16807,9 +15697,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16835,10 +15722,6 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16864,10 +15747,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16918,8 +15797,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16935,8 +15812,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17055,9 +15930,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17083,10 +15955,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17112,10 +15980,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17166,8 +16030,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17183,8 +16045,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17303,9 +16163,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17331,10 +16188,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17360,10 +16213,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17414,8 +16263,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17431,8 +16278,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17551,9 +16396,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17582,10 +16424,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17614,10 +16452,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17674,8 +16508,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17694,8 +16526,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17841,9 +16671,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: 
flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17872,10 +16699,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17904,10 +16727,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17964,8 +16783,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 
0x8 @@ -17984,8 +16801,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18131,9 +16946,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18162,10 +16974,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18194,10 +17002,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18254,8 +17058,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18274,8 +17076,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18421,9 +17221,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18452,10 +17249,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18484,10 +17277,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18544,8 +17333,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18564,8 +17351,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18711,9 +17496,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; 
%entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18742,10 +17524,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18774,10 +17552,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18834,8 +17608,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18854,8 +17626,6 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19001,9 +17771,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19032,10 +17799,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19064,10 +17827,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword 
s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19124,8 +17883,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19144,8 +17901,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19291,9 +18046,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19322,10 +18074,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19354,10 +18102,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19414,8 +18158,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19434,8 +18176,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19581,9 +18321,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19612,10 +18349,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19644,10 +18377,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19704,8 +18433,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19724,8 +18451,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19871,9 +18596,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19902,10 +18624,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19934,10 +18652,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19994,8 +18708,6 @@ define 
amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20014,8 +18726,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20161,9 +18871,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20192,10 +18899,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: 
s_load_dword s8, s[6:7], 0xc @@ -20224,10 +18927,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20284,8 +18983,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20304,8 +19001,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20451,9 +19146,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; 
GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20482,10 +19174,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20514,10 +19202,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20574,8 +19258,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20594,8 +19276,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20741,9 +19421,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20772,10 +19449,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20804,10 +19477,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20864,8 +19533,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20884,8 +19551,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21031,9 +19696,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21062,10 +19724,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21094,10 +19752,6 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21154,8 +19808,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21174,8 +19826,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21321,9 +19971,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ 
-21352,10 +19999,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21384,10 +20027,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21444,8 +20083,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21464,8 +20101,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21611,9 +20246,6 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21642,10 +20274,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21674,10 +20302,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21734,8 +20358,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21754,8 +20376,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 69c03ba6a3979..038b58deb0cf1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -15,9 +15,6 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX7-LABEL: flat_system_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX10-WGP-LABEL: flat_system_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -53,10 +46,6 @@ 
define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX10-CU-LABEL: flat_system_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -217,9 +202,6 @@ entry: define amdgpu_kernel void @flat_system_monotonic_load( ; GFX7-LABEL: flat_system_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -235,10 +217,6 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX10-WGP-LABEL: flat_system_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -255,10 +233,6 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX10-CU-LABEL: flat_system_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -290,8 +264,6 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -306,8 +278,6 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -419,9 +389,6 @@ entry: define amdgpu_kernel void @flat_system_acquire_load( ; GFX7-LABEL: flat_system_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -438,10 
+405,6 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX10-WGP-LABEL: flat_system_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -460,10 +423,6 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX10-CU-LABEL: flat_system_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -497,8 +456,6 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -515,8 +472,6 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -638,9 +593,6 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX7-LABEL: flat_system_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -658,10 +610,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -682,10 +630,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX10-CU-LABEL: flat_system_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -722,8 +666,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -741,8 +683,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 
0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -883,9 +823,6 @@ entry: define amdgpu_kernel void @flat_system_unordered_store( ; GFX7-LABEL: flat_system_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -897,10 +834,6 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX10-WGP-LABEL: flat_system_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -913,10 +846,6 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX10-CU-LABEL: flat_system_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -940,8 +869,6 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -953,8 +880,6 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: 
flat_system_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1038,9 +963,6 @@ entry: define amdgpu_kernel void @flat_system_monotonic_store( ; GFX7-LABEL: flat_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1052,10 +974,6 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX10-WGP-LABEL: flat_system_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1068,10 +986,6 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX10-CU-LABEL: flat_system_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1095,8 +1009,6 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1108,8 +1020,6 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1193,9 +1103,6 @@ entry: define amdgpu_kernel void @flat_system_release_store( ; GFX7-LABEL: flat_system_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1208,10 +1115,6 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX10-WGP-LABEL: flat_system_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1226,10 +1129,6 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX10-CU-LABEL: flat_system_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1256,8 
+1155,6 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1271,8 +1168,6 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1376,9 +1271,6 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX7-LABEL: flat_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1391,10 +1283,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1409,10 +1297,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX10-CU-LABEL: flat_system_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1439,8 +1323,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1454,8 +1336,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1559,9 +1439,6 @@ entry: define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX7-LABEL: flat_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1574,10 +1451,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], 
s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1591,10 +1464,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1620,8 +1489,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1634,8 +1501,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1728,9 +1593,6 @@ entry: define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX7-LABEL: flat_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1745,10 +1607,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry 
-; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1766,10 +1624,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1800,8 +1654,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1817,8 +1669,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1930,9 +1780,6 @@ entry: define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX7-LABEL: flat_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1946,10 +1793,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1965,10 +1808,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1997,8 +1836,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2013,8 +1850,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ 
-2127,9 +1962,6 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX7-LABEL: flat_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2145,10 +1977,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -2168,10 +1996,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2205,8 +2029,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2224,8 +2046,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2357,9 +2177,6 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX7-LABEL: flat_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2375,10 +2192,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -2398,10 +2211,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2435,8 +2244,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2454,8 +2261,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2587,9 +2392,6 @@ entry: define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2606,10 +2408,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2628,10 +2426,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 
0x8 @@ -2665,8 +2459,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2683,8 +2475,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2804,9 +2594,6 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2824,10 +2611,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2848,10 +2631,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: 
; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2888,8 +2667,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2908,8 +2685,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3053,9 +2828,6 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3073,10 +2845,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3097,10 +2865,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3137,8 +2901,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3157,8 +2919,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3302,9 +3062,6 @@ entry: define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: 
s_load_dword s7, s[4:5], 0x2 @@ -3330,10 +3087,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3359,10 +3112,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3413,8 +3162,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3430,8 +3177,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3550,9 +3295,6 @@ entry: define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3580,10 +3322,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3613,10 +3351,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3672,8 +3406,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3692,8 +3424,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3831,9 +3561,6 @@ entry: define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3860,10 +3587,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3891,10 +3614,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, 
s[4:5], 0x8 @@ -3948,8 +3667,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3967,8 +3684,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4107,9 +3822,6 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4138,10 +3850,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4173,10 +3881,6 @@ define amdgpu_kernel void 
@flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4235,8 +3939,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4257,8 +3959,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4416,9 +4116,6 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4447,10 +4144,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_system_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4482,10 +4175,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4544,8 +4233,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4566,8 +4253,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4725,9 +4410,6 @@ entry: define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; 
GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4755,10 +4437,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4788,10 +4466,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4847,8 +4521,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4867,8 +4539,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_system_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5006,9 +4676,6 @@ entry: define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5036,10 +4703,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5069,10 +4732,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5128,8 +4787,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5148,8 +4805,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5287,9 +4942,6 @@ entry: define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX7-LABEL: flat_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5318,10 +4970,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5353,10 +5001,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 
s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5415,8 +5059,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5437,8 +5079,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5596,9 +5236,6 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5627,10 +5264,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5662,10 +5295,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5724,8 +5353,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5746,8 +5373,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5905,9 +5530,6 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5936,10 +5558,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5971,10 +5589,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6033,8 +5647,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6055,8 +5667,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6214,9 +5824,6 @@ entry: define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6245,10 +5852,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6280,10 +5883,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6342,8 +5941,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6364,8 +5961,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6523,9 +6118,6 @@ entry: define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6554,10 +6146,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6589,10 +6177,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword 
s7, s[4:5], 0x8 @@ -6651,8 +6235,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6673,8 +6255,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6832,9 +6412,6 @@ entry: define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6863,10 +6440,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6898,10 +6471,6 @@ define amdgpu_kernel void 
@flat_system_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6960,8 +6529,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6982,8 +6549,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7141,9 +6706,6 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -7172,10 +6734,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; 
GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -7207,10 +6765,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -7269,8 +6823,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7291,8 +6843,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7450,9 +7000,6 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; 
GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -7481,10 +7028,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -7516,10 +7059,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -7578,8 +7117,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7600,8 +7137,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; 
%entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7759,9 +7294,6 @@ entry: define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7790,10 +7322,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7822,10 +7350,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7882,8 +7406,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7902,8 +7424,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8049,9 +7569,6 @@ entry: define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8081,10 +7598,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8115,10 +7628,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; 
GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8177,8 +7686,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8199,8 +7706,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8356,9 +7861,6 @@ entry: define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8388,10 +7890,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; 
GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8422,10 +7920,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8485,8 +7979,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8507,8 +7999,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8674,9 +8164,6 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry 
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8707,10 +8194,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8743,10 +8226,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8808,8 +8287,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8832,8 +8309,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9013,9 +8488,6 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9046,10 +8518,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9082,10 +8550,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9147,8 +8611,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; 
GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9171,8 +8633,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9352,9 +8812,6 @@ entry: define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9384,10 +8841,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9418,10 +8871,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9480,8 +8929,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9502,8 +8949,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9663,9 +9108,6 @@ entry: define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9695,10 +9137,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; 
GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9729,10 +9167,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9791,8 +9225,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9813,8 +9245,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9970,9 +9400,6 @@ entry: define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX7-LABEL: 
flat_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10003,10 +9430,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10039,10 +9462,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10104,8 +9523,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10128,8 +9545,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; 
GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10309,9 +9724,6 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10342,10 +9754,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10378,10 +9786,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10443,8 +9847,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; 
GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10467,8 +9869,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10648,9 +10048,6 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10681,10 +10078,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10717,10 +10110,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10782,8 +10171,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10806,8 +10193,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10987,9 +10372,6 @@ entry: define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11020,10 +10402,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; 
GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11056,10 +10434,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11121,8 +10495,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11145,8 +10517,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11326,9 +10696,6 @@ entry: define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; 
GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11359,10 +10726,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11395,10 +10758,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11460,8 +10819,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11484,8 +10841,6 @@ define amdgpu_kernel void 
@flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11661,9 +11016,6 @@ entry: define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11694,10 +11046,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11730,10 +11078,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11795,8 +11139,6 @@ define amdgpu_kernel void 
@flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11819,8 +11161,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -12000,9 +11340,6 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -12033,10 +11370,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -12069,10 +11402,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; 
GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -12134,8 +11463,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -12158,8 +11485,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -12339,9 +11664,6 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -12372,10 +11694,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -12408,10 +11726,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -12473,8 +11787,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -12497,8 +11809,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -12678,9 +11988,6 @@ entry: define amdgpu_kernel void 
@flat_system_one_as_unordered_load( ; GFX7-LABEL: flat_system_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12696,10 +12003,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12716,10 +12019,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_system_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12751,8 +12050,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12767,8 +12064,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12880,9 +12175,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX7-LABEL: flat_system_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12898,10 +12190,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12918,10 +12206,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12953,8 +12237,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12969,8 +12251,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13082,9 +12362,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX7-LABEL: flat_system_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -13102,10 +12379,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13125,10 +12398,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -13164,8 +12433,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13183,8 +12450,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13311,9 +12576,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX7-LABEL: flat_system_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -13332,10 +12594,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13357,10 +12615,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; 
GFX10-CU-LABEL: flat_system_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -13399,8 +12653,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13419,8 +12671,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13566,9 +12816,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX7-LABEL: flat_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13580,10 +12827,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13596,10 +12839,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_system_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13623,8 +12862,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13636,8 +12873,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13721,9 +12956,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX7-LABEL: flat_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, 
s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13735,10 +12967,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13751,10 +12979,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13778,8 +13002,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13791,8 +13013,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13876,9 +13096,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX7-LABEL: flat_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13891,10 +13108,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13909,10 +13122,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX10-CU-LABEL: flat_system_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13939,8 +13148,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -13954,8 +13161,6 @@ define amdgpu_kernel void 
@flat_system_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -14059,9 +13264,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX7-LABEL: flat_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14074,10 +13276,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -14092,10 +13290,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -14122,8 +13316,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: 
; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -14137,8 +13329,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -14242,9 +13432,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14257,10 +13444,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -14274,10 +13457,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -14303,8 +13482,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -14317,8 +13494,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -14411,9 +13586,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14428,10 +13600,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] 
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -14448,10 +13616,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -14481,8 +13645,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -14498,8 +13660,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -14609,9 +13769,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX7-LABEL: flat_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14625,10 +13782,6 @@ define amdgpu_kernel void 
@flat_system_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -14644,10 +13797,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -14676,8 +13825,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -14692,8 +13839,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -14806,9 +13951,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: 
flat_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14824,10 +13966,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -14846,10 +13984,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -14882,8 +14016,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -14901,8 +14033,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -15032,9 +14162,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -15050,10 +14177,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -15072,10 +14195,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -15108,8 +14227,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -15127,8 +14244,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -15258,9 +14373,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15278,10 +14390,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15301,10 +14409,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15340,8 +14444,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15359,8 +14461,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15485,9 +14585,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15506,10 +14603,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 
0x8 @@ -15531,10 +14624,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15573,8 +14662,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15594,8 +14681,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15744,9 +14829,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15765,10 +14847,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; 
GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15790,10 +14868,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15832,8 +14906,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -15853,8 +14925,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -16003,9 +15073,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: 
flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16031,10 +15098,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16060,10 +15123,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16114,8 +15173,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16131,8 +15188,6 @@ define amdgpu_kernel void 
@flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16251,9 +15306,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16281,10 +15333,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16313,10 +15361,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16371,8 +15415,6 
@@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16391,8 +15433,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16528,9 +15568,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16557,10 +15594,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16588,10 +15621,6 @@ define 
amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16645,8 +15674,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16664,8 +15691,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16804,9 +15829,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16835,10 +15857,6 @@ define amdgpu_kernel void 
@flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16869,10 +15887,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16930,8 +15944,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16952,8 +15964,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dword s7, s[8:9], 0x8 @@ -17109,9 +16119,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17140,10 +16147,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17174,10 +16177,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17235,8 +16234,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17257,8 +16254,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17414,9 +16409,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17444,10 +16436,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17476,10 +16464,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] 
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17534,8 +16518,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17554,8 +16536,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17691,9 +16671,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17721,10 +16698,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17753,10 +16726,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17811,8 +16780,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17831,8 +16798,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17968,9 +16933,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17999,10 +16961,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18033,10 +16991,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18094,8 +17048,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18116,8 +17068,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18273,9 +17223,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18304,10 +17251,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18338,10 +17281,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18399,8 +17338,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18421,8 +17358,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18578,9 +17513,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18609,10 +17541,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18643,10 +17571,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18704,8 +17628,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18726,8 +17648,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18883,9 +17803,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18914,10 +17831,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18948,10 +17861,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19009,8 +17918,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19031,8 +17938,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19188,9 +18093,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -19219,10 +18121,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19253,10 +18151,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19314,8 +18208,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19336,8 +18228,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; 
GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19493,9 +18383,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -19524,10 +18411,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19558,10 +18441,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19619,8 +18498,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19641,8 +18518,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19798,9 +18673,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -19829,10 +18701,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19863,10 +18731,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -19924,8 +18788,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19946,8 +18808,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20103,9 +18963,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -20134,10 +18991,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -20168,10 +19021,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -20229,8 +19078,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20251,8 +19098,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20408,9 +19253,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 
-; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20439,10 +19281,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20471,10 +19309,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20531,8 +19365,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20551,8 +19383,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20698,9 +19528,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20731,10 +19558,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20766,10 +19589,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20830,8 +19649,6 @@ define amdgpu_kernel void 
@flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20853,8 +19670,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21015,9 +19830,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21047,10 +19859,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21081,10 +19889,6 @@ 
define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21144,8 +19948,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21166,8 +19968,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21333,9 +20133,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ 
-21367,10 +20164,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21404,10 +20197,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21471,8 +20260,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21496,8 +20283,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], 
s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21682,9 +20467,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21716,10 +20498,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21753,10 +20531,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21820,8 +20594,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21845,8 +20617,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22031,9 +20801,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22064,10 +20831,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22099,10 +20862,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; 
GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22163,8 +20922,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22186,8 +20943,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22352,9 +21107,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22385,10 +21137,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, 
s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22420,10 +21168,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22484,8 +21228,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22507,8 +21249,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22669,9 +21409,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: 
flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22703,10 +21440,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22740,10 +21473,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22807,8 +21536,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22832,8 +21559,6 @@ define amdgpu_kernel 
void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23018,9 +21743,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23052,10 +21774,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23089,10 +21807,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, 
s[6:7], 0xc @@ -23156,8 +21870,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23181,8 +21893,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23367,9 +22077,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23401,10 +22108,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: 
s_load_dword s8, s[6:7], 0xc @@ -23438,10 +22141,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23505,8 +22204,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23530,8 +22227,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23716,9 +22411,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; 
GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23750,10 +22442,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23787,10 +22475,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23854,8 +22538,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -23879,8 +22561,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -24065,9 +22745,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -24099,10 +22776,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -24136,10 +22809,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -24203,8 +22872,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -24228,8 +22895,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -24410,9 +23075,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -24444,10 +23106,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -24481,10 +23139,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 
s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -24548,8 +23202,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -24573,8 +23225,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -24759,9 +23409,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -24793,10 +23440,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: 
s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -24830,10 +23473,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -24897,8 +23536,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -24922,8 +23559,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -25108,9 +23743,6 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: 
flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -25142,10 +23774,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -25179,10 +23807,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -25246,8 +23870,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -25271,8 +23893,6 @@ define amdgpu_kernel 
void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index 5f788e2e41ac5..f1b465c1789da 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -11,9 +11,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -30,10 +27,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -51,10 +44,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-CU-LABEL: flat_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: 
s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -165,9 +154,6 @@ entry: define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-LABEL: flat_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 @@ -197,10 +183,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 @@ -229,10 +211,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-CU-LABEL: flat_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 @@ -433,9 +411,6 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX7-LABEL: flat_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -452,10 +427,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; 
GFX10-WGP-LABEL: flat_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -473,10 +444,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-CU-LABEL: flat_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -591,9 +558,6 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-LABEL: flat_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -623,10 +587,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -656,10 +616,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-CU-LABEL: flat_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: 
s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -865,9 +821,6 @@ entry: define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX7-LABEL: flat_volatile_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -884,10 +837,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -905,10 +854,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1015,9 +960,6 @@ entry: define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX7-LABEL: flat_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1030,10 +972,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1048,10 +986,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX10-CU-LABEL: flat_volatile_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index dad713198cc89..23982f8a00cdb 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -15,9 +15,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX7-LABEL: flat_wavefront_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; 
GFX10-WGP-LABEL: flat_wavefront_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX10-CU-LABEL: flat_wavefront_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -217,9 +202,6 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX7-LABEL: flat_wavefront_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, 
s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -235,10 +217,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -255,10 +233,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -290,8 +264,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -306,8 +278,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -419,9 +389,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX7-LABEL: flat_wavefront_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -437,10 +404,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -457,10 +420,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -492,8 +451,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -508,8 +465,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -621,9 +576,6 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX7-LABEL: flat_wavefront_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -639,10 +591,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -659,10 +607,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -694,8 +638,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -710,8 +652,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -823,9 +763,6 @@ entry: define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX7-LABEL: flat_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -837,10 +774,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX10-WGP-LABEL: flat_wavefront_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -853,10 +786,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX10-CU-LABEL: flat_wavefront_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 
@@ -880,8 +809,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -893,8 +820,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -978,9 +903,6 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX7-LABEL: flat_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -992,10 +914,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1008,10 +926,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, 
s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1035,8 +949,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1048,8 +960,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1133,9 +1043,6 @@ entry: define amdgpu_kernel void @flat_wavefront_release_store( ; GFX7-LABEL: flat_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1147,10 +1054,6 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX10-WGP-LABEL: flat_wavefront_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: 
s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1163,10 +1066,6 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX10-CU-LABEL: flat_wavefront_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1190,8 +1089,6 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1203,8 +1100,6 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1288,9 +1183,6 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX7-LABEL: flat_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1302,10 +1194,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX10-WGP-LABEL: 
flat_wavefront_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1318,10 +1206,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1345,8 +1229,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1358,8 +1240,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1443,9 +1323,6 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: 
s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1458,10 +1335,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1475,10 +1348,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1504,8 +1373,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1518,8 +1385,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: 
s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1612,9 +1477,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX7-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1627,10 +1489,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1644,10 +1502,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1673,8 +1527,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ 
-1687,8 +1539,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1781,9 +1631,6 @@ entry: define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX7-LABEL: flat_wavefront_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1796,10 +1643,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1813,10 +1656,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1842,8 +1681,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw: 
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1856,8 +1693,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1950,9 +1785,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1965,10 +1797,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1982,10 +1810,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2011,8 +1835,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2025,8 +1847,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2119,9 +1939,6 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2134,10 +1951,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ 
-2151,10 +1964,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2180,8 +1989,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2194,8 +2001,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2288,9 +2093,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2306,10 +2108,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; 
GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2326,10 +2124,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2361,8 +2155,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2377,8 +2169,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2488,9 +2278,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 
s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2506,10 +2293,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2526,10 +2309,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2561,8 +2340,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2577,8 +2354,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2688,9 +2463,6 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2706,10 +2478,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2726,10 +2494,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2761,8 +2525,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 
0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2777,8 +2539,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2888,9 +2648,6 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2916,10 +2673,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2945,10 +2698,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: 
s_load_dword s7, s[4:5], 0x8 @@ -2999,8 +2748,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3016,8 +2763,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3136,9 +2881,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3164,10 +2906,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3193,10 
+2931,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3247,8 +2981,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3264,8 +2996,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3384,9 +3114,6 @@ entry: define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3412,10 +3139,6 @@ define amdgpu_kernel void 
@flat_wavefront_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3441,10 +3164,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3495,8 +3214,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3512,8 +3229,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3632,9 
+3347,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3660,10 +3372,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3689,10 +3397,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3743,8 +3447,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ 
-3760,8 +3462,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3880,9 +3580,6 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3908,10 +3605,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3937,10 +3630,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3991,8 +3680,6 
@@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4008,8 +3695,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4128,9 +3813,6 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4156,10 +3838,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4185,10 +3863,6 @@ define amdgpu_kernel void 
@flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4239,8 +3913,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4256,8 +3928,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4376,9 +4046,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4404,10 +4071,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_wavefront_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4433,10 +4096,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4487,8 +4146,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4504,8 +4161,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4624,9 +4279,6 @@ entry: define amdgpu_kernel void 
@flat_wavefront_release_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4652,10 +4304,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4681,10 +4329,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4735,8 +4379,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4752,8 +4394,6 @@ define amdgpu_kernel void 
@flat_wavefront_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4872,9 +4512,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4900,10 +4537,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4929,10 +4562,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4983,8 +4612,6 @@ define amdgpu_kernel void 
@flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5000,8 +4627,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5120,9 +4745,6 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5148,10 +4770,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5177,10 +4795,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5231,8 +4845,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5248,8 +4860,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5368,9 +4978,6 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5396,10 +5003,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry 
-; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5425,10 +5028,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5479,8 +5078,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5496,8 +5093,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5616,9 +5211,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: 
; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5644,10 +5236,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5673,10 +5261,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5727,8 +5311,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5744,8 +5326,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5864,9 +5444,6 @@ entry: define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5892,10 +5469,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5921,10 +5494,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5975,8 +5544,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; 
%entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5992,8 +5559,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6112,9 +5677,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6140,10 +5702,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6169,10 +5727,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 
0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6223,8 +5777,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6240,8 +5792,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6360,9 +5910,6 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6388,10 +5935,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6417,10 +5960,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6471,8 +6010,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6488,8 +6025,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6608,9 +6143,6 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6639,10 +6171,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6671,10 +6199,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6731,8 +6255,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6751,8 +6273,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6898,9 +6418,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6929,10 +6446,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6961,10 +6474,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7021,8 +6530,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: 
; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7041,8 +6548,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7188,9 +6693,6 @@ entry: define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7219,10 +6721,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7251,10 +6749,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: 
s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7311,8 +6805,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7331,8 +6823,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7478,9 +6968,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7509,10 +6996,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, 
s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7541,10 +7024,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7601,8 +7080,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7621,8 +7098,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7768,9 +7243,6 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: 
flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7799,10 +7271,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7831,10 +7299,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7891,8 +7355,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7911,8 +7373,6 @@ define amdgpu_kernel void 
@flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8058,9 +7518,6 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8089,10 +7546,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8121,10 +7574,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8181,8 +7630,6 
@@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8201,8 +7648,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8348,9 +7793,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8379,10 +7821,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8411,10 +7849,6 @@ define 
amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8471,8 +7905,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8491,8 +7923,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8638,9 +8068,6 @@ entry: define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8669,10 +8096,6 @@ define amdgpu_kernel void 
@flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8701,10 +8124,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8761,8 +8180,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8781,8 +8198,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, 
s[8:9], 0x8 @@ -8928,9 +8343,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8959,10 +8371,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8991,10 +8399,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9051,8 +8455,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9071,8 +8473,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9218,9 +8618,6 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9249,10 +8646,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9281,10 +8674,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: 
s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9341,8 +8730,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9361,8 +8748,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9508,9 +8893,6 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9539,10 +8921,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, 
s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9571,10 +8949,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9631,8 +9005,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9651,8 +9023,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9798,9 +9168,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; 
GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9829,10 +9196,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9861,10 +9224,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9921,8 +9280,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9941,8 +9298,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10088,9 +9443,6 @@ entry: define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10119,10 +9471,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10151,10 +9499,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10211,8 +9555,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10231,8 +9573,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10378,9 +9718,6 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10409,10 +9746,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10441,10 +9774,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10501,8 +9830,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10521,8 +9848,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10668,9 +9993,6 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10699,10 +10021,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10731,10 +10049,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10791,8 +10105,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10811,8 +10123,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10958,9 +10268,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX7-LABEL: flat_wavefront_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 
8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10976,10 +10283,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10996,10 +10299,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11031,8 +10330,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11047,8 +10344,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11160,9 +10455,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11178,10 +10470,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11198,10 +10486,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11233,8 +10517,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ 
-11249,8 +10531,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11362,9 +10642,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX7-LABEL: flat_wavefront_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11380,10 +10657,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11400,10 +10673,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11435,8 +10704,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; 
GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11451,8 +10718,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11564,9 +10829,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11582,10 +10844,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11602,10 +10860,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: 
s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11637,8 +10891,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11653,8 +10905,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11766,9 +11016,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX7-LABEL: flat_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11780,10 +11027,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; 
GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11796,10 +11039,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11823,8 +11062,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11836,8 +11073,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11921,9 +11156,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ 
-11935,10 +11167,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11951,10 +11179,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11978,8 +11202,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11991,8 +11213,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12076,9 +11296,6 @@ entry: define 
amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX7-LABEL: flat_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12090,10 +11307,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12106,10 +11319,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12133,8 +11342,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12146,8 +11353,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12231,9 +11436,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12245,10 +11447,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12261,10 +11459,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12288,8 +11482,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12301,8 +11493,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12386,9 +11576,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12401,10 +11588,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12418,10 +11601,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12447,8 +11626,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12461,8 +11638,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12555,9 +11730,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12570,10 +11742,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: 
s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12587,10 +11755,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12616,8 +11780,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12630,8 +11792,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12724,9 +11884,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12739,10 +11896,6 
@@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12756,10 +11909,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12785,8 +11934,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12799,8 +11946,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12893,9 +12038,6 @@ entry: define amdgpu_kernel void 
@flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12908,10 +12050,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12925,10 +12063,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12954,8 +12088,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12968,8 +12100,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; 
GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13062,9 +12192,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13077,10 +12204,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13094,10 +12217,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -13123,8 +12242,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13137,8 +12254,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13231,9 +12346,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13249,10 +12361,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13269,10 +12377,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 
s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13304,8 +12408,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13320,8 +12422,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13431,9 +12531,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13449,10 +12546,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13469,10 +12562,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13504,8 +12593,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13520,8 +12607,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13631,9 +12716,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13649,10 +12731,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13669,10 +12747,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13704,8 +12778,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13720,8 +12792,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13831,9 +12901,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13859,10 +12926,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13888,10 +12951,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13942,8 +13001,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13959,8 +13016,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14079,9 +13134,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14107,10 +13159,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14136,10 +13184,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14190,8 +13234,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14207,8 +13249,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14327,9 +13367,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14355,10 +13392,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 
0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14384,10 +13417,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14438,8 +13467,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14455,8 +13482,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14575,9 +13600,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; 
%bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14603,10 +13625,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14632,10 +13650,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14686,8 +13700,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14703,8 +13715,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; 
GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14823,9 +13833,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14851,10 +13858,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14880,10 +13883,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14934,8 +13933,6 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14951,8 +13948,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15071,9 +14066,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15099,10 +14091,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15128,10 +14116,6 @@ define 
amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15182,8 +14166,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15199,8 +14181,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15319,9 +14299,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15347,10 +14324,6 @@ define 
amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15376,10 +14349,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15430,8 +14399,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15447,8 +14414,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15567,9 +14532,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15595,10 +14557,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15624,10 +14582,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15678,8 +14632,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15695,8 +14647,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15815,9 +14765,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15843,10 +14790,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15872,10 +14815,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15926,8 +14865,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15943,8 +14880,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16063,9 +14998,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16091,10 +15023,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16120,10 +15048,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16174,8 +15098,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16191,8 +15113,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16311,9 +15231,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16339,10 +15256,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16368,10 +15281,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16422,8 +15331,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16439,8 +15346,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16559,9 +15464,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16587,10 +15489,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16616,10 +15514,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16670,8 +15564,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16687,8 +15579,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16807,9 +15697,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16835,10 +15722,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16864,10 +15747,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; 
%bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16918,8 +15797,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16935,8 +15812,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17055,9 +15930,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17083,10 +15955,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; 
%entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17112,10 +15980,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17166,8 +16030,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17183,8 +16045,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17303,9 +16163,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; 
GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17331,10 +16188,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17360,10 +16213,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17414,8 +16263,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17431,8 +16278,6 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17551,9 +16396,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17582,10 +16424,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17614,10 +16452,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; 
GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17674,8 +16508,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17694,8 +16526,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17841,9 +16671,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17872,10 +16699,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], 
s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17904,10 +16727,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17964,8 +16783,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17984,8 +16801,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18131,9 +16946,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18162,10 +16974,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18194,10 +17002,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18254,8 +17058,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18274,8 +17076,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18421,9 +17221,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18452,10 +17249,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18484,10 +17277,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18544,8 +17333,6 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18564,8 +17351,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18711,9 +17496,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18742,10 +17524,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc 
@@ -18774,10 +17552,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18834,8 +17608,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18854,8 +17626,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19001,9 +17771,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; 
GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19032,10 +17799,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19064,10 +17827,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19124,8 +17883,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19144,8 +17901,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, 
s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19291,9 +18046,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19322,10 +18074,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19354,10 +18102,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19414,8 +18158,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19434,8 +18176,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19581,9 +18321,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19612,10 +18349,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19644,10 +18377,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; 
GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19704,8 +18433,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19724,8 +18451,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19871,9 +18596,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19902,10 +18624,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; 
GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19934,10 +18652,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19994,8 +18708,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20014,8 +18726,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20161,9 +18871,6 @@ entry: define amdgpu_kernel 
void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20192,10 +18899,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20224,10 +18927,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20284,8 +18983,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20304,8 +19001,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20451,9 +19146,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20482,10 +19174,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20514,10 +19202,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), 
s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20574,8 +19258,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20594,8 +19276,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20741,9 +19421,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20772,10 +19449,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20804,10 +19477,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20864,8 +19533,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20884,8 +19551,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21031,9 +19696,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 
-; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21062,10 +19724,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21094,10 +19752,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21154,8 +19808,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21174,8 +19826,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21321,9 +19971,6 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21352,10 +19999,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21384,10 +20027,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21444,8 +20083,6 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21464,8 +20101,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index a350394bcafe5..5ddabad7374dd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -15,9 +15,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX7-LABEL: flat_workgroup_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX10-WGP-LABEL: flat_workgroup_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX10-CU-LABEL: flat_workgroup_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -217,9 +202,6 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX7-LABEL: flat_workgroup_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -235,10 +217,6 @@ define 
amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -255,10 +233,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -290,8 +264,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -306,8 +278,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -419,9 +389,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX7-LABEL: flat_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -438,10 +405,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -459,10 +422,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -496,8 +455,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -513,8 +470,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], 
s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -633,9 +588,6 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX7-LABEL: flat_workgroup_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -653,10 +605,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -676,10 +624,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -715,8 +659,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -733,8 +675,6 @@ define amdgpu_kernel void 
@flat_workgroup_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -866,9 +806,6 @@ entry: define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX7-LABEL: flat_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -880,10 +817,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX10-WGP-LABEL: flat_workgroup_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -896,10 +829,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX10-CU-LABEL: flat_workgroup_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -923,8 +852,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -936,8 +863,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1021,9 +946,6 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX7-LABEL: flat_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1035,10 +957,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1051,10 +969,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 
0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1078,8 +992,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1091,8 +1003,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1176,9 +1086,6 @@ entry: define amdgpu_kernel void @flat_workgroup_release_store( ; GFX7-LABEL: flat_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1191,10 +1098,6 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX10-WGP-LABEL: flat_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1209,10 +1112,6 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX10-CU-LABEL: 
flat_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1238,8 +1137,6 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1252,8 +1149,6 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1349,9 +1244,6 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX7-LABEL: flat_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,10 +1256,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1382,10 +1270,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1411,8 +1295,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1425,8 +1307,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -1522,9 +1402,6 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1537,10 
+1414,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1554,10 +1427,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1583,8 +1452,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1597,8 +1464,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1691,9 +1556,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX7-LABEL: 
flat_workgroup_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1707,10 +1569,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1727,10 +1585,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1758,8 +1612,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1773,8 +1625,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -1879,9 +1729,6 @@ entry: define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX7-LABEL: flat_workgroup_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1895,10 +1742,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -1914,10 +1757,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -1945,8 +1784,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], 
s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -1960,8 +1797,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2066,9 +1901,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2083,10 +1915,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -2105,10 +1933,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2138,8 +1962,6 @@ define 
amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2154,8 +1976,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2272,9 +2092,6 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2289,10 +2106,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -2311,10 +2124,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 
-; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -2344,8 +2153,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -2360,8 +2167,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -2478,9 +2283,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2497,10 +2299,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; 
GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2518,10 +2316,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2555,8 +2349,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2572,8 +2364,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2690,9 +2480,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2710,10 +2497,6 @@ 
define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2733,10 +2516,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2772,8 +2551,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2790,8 +2567,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2922,9 +2697,6 @@ entry: define amdgpu_kernel void 
@flat_workgroup_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2942,10 +2714,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -2965,10 +2733,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3004,8 +2768,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3022,8 +2784,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: 
flat_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -3154,9 +2914,6 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3182,10 +2939,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3211,10 +2964,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3265,8 +3014,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3282,8 +3029,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3402,9 +3147,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3431,10 +3173,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3463,10 +3201,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX10-CU: ; 
%bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3519,8 +3253,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3537,8 +3269,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3669,9 +3399,6 @@ entry: define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3698,10 +3425,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; 
GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3729,10 +3452,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3785,8 +3504,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3803,8 +3520,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3935,9 +3650,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3965,10 +3677,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3999,10 +3707,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4057,8 +3761,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4076,8 +3778,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; 
%entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4220,9 +3920,6 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4250,10 +3947,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4284,10 +3977,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4342,8 +4031,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4361,8 +4048,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4505,9 +4190,6 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4534,10 +4216,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4566,10 +4244,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 
s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4622,8 +4296,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4640,8 +4312,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4772,9 +4442,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4801,10 +4468,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4833,10 +4496,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4889,8 +4548,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4907,8 +4564,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5039,9 +4694,6 @@ entry: define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5069,10 +4721,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5103,10 +4751,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5161,8 +4805,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5180,8 +4822,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5324,9 +4964,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5354,10 +4991,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5388,10 +5021,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5446,8 +5075,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5465,8 +5092,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5609,9 +5234,6 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5639,10 +5261,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5673,10 +5291,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5731,8 +5345,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5750,8 +5362,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5894,9 +5504,6 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5924,10 +5531,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] 
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5958,10 +5561,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6016,8 +5615,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6035,8 +5632,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6179,9 +5774,6 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; 
GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6210,10 +5802,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6242,10 +5830,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6302,8 +5886,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6322,8 +5904,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: 
s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6469,9 +6049,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6501,10 +6078,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6534,10 +6107,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6596,8 +6165,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6617,8 +6184,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6771,9 +6336,6 @@ entry: define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6803,10 +6365,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6837,10 +6395,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6899,8 +6453,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6920,8 +6472,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7079,9 +6629,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7112,10 +6659,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7147,10 +6690,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7211,8 +6750,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7233,8 +6770,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7401,9 +6936,6 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 
-; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7434,10 +6966,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7469,10 +6997,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7533,8 +7057,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7555,8 +7077,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: 
; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7723,9 +7243,6 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7755,10 +7272,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7788,10 +7301,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7850,8 +7359,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7871,8 +7378,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8027,9 +7532,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8059,10 +7561,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8092,10 +7590,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: 
flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8154,8 +7648,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8175,8 +7667,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8329,9 +7819,6 @@ entry: define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8362,10 +7849,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8397,10 +7880,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8461,8 +7940,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8483,8 +7960,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8651,9 +8126,6 @@ entry: define amdgpu_kernel void 
@flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8684,10 +8156,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8719,10 +8187,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8783,8 +8247,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8805,8 +8267,6 @@ 
define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8973,9 +8433,6 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9006,10 +8463,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9041,10 +8494,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9105,8 
+8554,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9127,8 +8574,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9295,9 +8740,6 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9328,10 +8770,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9363,10 +8801,6 @@ define 
amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9427,8 +8861,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9449,8 +8881,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9617,9 +9047,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9650,10 +9077,6 @@ define amdgpu_kernel void 
@flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9685,10 +9108,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9749,8 +9168,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9771,8 +9188,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, 
s[8:9], 0x8 @@ -9937,9 +9352,6 @@ entry: define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9970,10 +9382,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10005,10 +9413,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10069,8 +9473,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10091,8 +9493,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10259,9 +9659,6 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10292,10 +9689,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10327,10 +9720,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: 
s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10391,8 +9780,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10413,8 +9800,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10581,9 +9966,6 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10614,10 +9996,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 
0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10649,10 +10027,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10713,8 +10087,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10735,8 +10107,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10903,9 +10273,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX7-LABEL: flat_workgroup_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], 
s[4:5], 0x2 @@ -10921,10 +10288,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -10941,10 +10304,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -10976,8 +10335,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -10992,8 +10349,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11105,9 +10460,6 @@ entry: define amdgpu_kernel void 
@flat_workgroup_one_as_monotonic_load( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11123,10 +10475,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11143,10 +10491,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11178,8 +10522,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11194,8 +10536,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: 
flat_workgroup_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11307,9 +10647,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX7-LABEL: flat_workgroup_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11325,10 +10662,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11347,10 +10680,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11382,8 +10711,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: 
s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11398,8 +10725,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11517,9 +10842,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11535,10 +10857,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -11559,10 +10877,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -11594,8 +10908,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -11610,8 +10922,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -11739,9 +11049,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX7-LABEL: flat_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11753,10 +11060,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 
0x8 @@ -11769,10 +11072,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11796,8 +11095,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11809,8 +11106,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11894,9 +11189,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11908,10 +11200,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: 
flat_workgroup_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11924,10 +11212,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11951,8 +11235,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -11964,8 +11246,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12049,9 +11329,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX7-LABEL: flat_workgroup_one_as_release_store: ; GFX7: ; 
%bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12063,10 +11340,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12081,10 +11354,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12108,8 +11377,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12121,8 +11388,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, 
s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12215,9 +11480,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12229,10 +11491,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12247,10 +11505,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12274,8 +11528,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12287,8 +11539,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 @@ -12381,9 +11631,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12396,10 +11643,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12413,10 +11656,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] 
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12442,8 +11681,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12456,8 +11693,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12550,9 +11785,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12565,10 +11797,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12584,10 +11812,6 @@ define 
amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12613,8 +11837,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12627,8 +11849,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12729,9 +11949,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12744,10 +11961,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: ; 
GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12763,10 +11976,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12792,8 +12001,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12806,8 +12013,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -12909,9 +12114,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12924,10 +12126,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -12945,10 +12143,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -12974,8 +12168,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -12988,8 +12180,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; 
GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13099,9 +12289,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13114,10 +12301,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -13135,10 +12318,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -13164,8 +12343,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -13178,8 +12355,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -13289,9 +12464,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13307,10 +12479,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13329,10 +12497,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13364,8 +12528,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13380,8 +12542,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13497,9 +12657,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13515,10 +12672,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: 
s_load_dword s6, s[6:7], 0x8 @@ -13539,10 +12692,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13574,8 +12723,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13590,8 +12737,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13718,9 +12863,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13736,10 +12878,6 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13760,10 +12898,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13795,8 +12929,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13811,8 +12943,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 @@ -13939,9 +13069,6 @@ entry: define 
amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13967,10 +13094,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13996,10 +13119,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14050,8 +13169,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14067,8 +13184,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14187,9 +13302,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14215,10 +13327,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14246,10 +13354,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], 
s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14300,8 +13404,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14317,8 +13419,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14445,9 +13545,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14473,10 +13570,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 
s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14504,10 +13597,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14558,8 +13647,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14575,8 +13662,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14704,9 +13789,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: 
s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14732,10 +13814,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14765,10 +13843,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14819,8 +13893,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14836,8 +13908,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14973,9 +14043,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15001,10 +14068,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15034,10 +14097,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15088,8 +14147,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15105,8 +14162,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15242,9 +14297,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15270,10 +14322,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15301,10 +14349,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; 
GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15355,8 +14399,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15372,8 +14414,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15500,9 +14540,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15528,10 +14565,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; 
GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15559,10 +14592,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15613,8 +14642,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15630,8 +14657,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15758,9 +14783,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; 
GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15786,10 +14808,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15819,10 +14837,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15873,8 +14887,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15890,8 +14902,6 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16027,9 +15037,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16055,10 +15062,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16088,10 +15091,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16142,8 
+15141,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16159,8 +15156,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16296,9 +15291,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16324,10 +15316,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16357,10 
+15345,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16411,8 +15395,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16428,8 +15410,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16565,9 +15545,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16593,10 +15570,6 @@ 
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16626,10 +15599,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16680,8 +15649,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16697,8 +15664,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16834,9 +15799,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16862,10 +15824,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16895,10 +15853,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16949,8 +15903,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16966,8 +15918,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17103,9 +16053,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17131,10 +16078,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17164,10 +16107,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17218,8 +16157,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17235,8 +16172,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17372,9 +16307,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17400,10 +16332,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17433,10 +16361,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17487,8 +16411,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17504,8 +16426,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17641,9 +16561,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17669,10 +16586,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17702,10 +16615,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17756,8 +16665,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17773,8 +16680,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17910,9 +16815,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17941,10 +16843,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17973,10 +16871,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18033,8 +16927,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; 
GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18053,8 +16945,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18200,9 +17090,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18231,10 +17118,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18265,10 +17148,6 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18325,8 +17204,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18345,8 +17222,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18498,9 +17373,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ 
-18529,10 +17401,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18563,10 +17431,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18623,8 +17487,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18643,8 +17505,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; 
GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18799,9 +17659,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18830,10 +17687,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18866,10 +17719,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18926,8 +17775,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18946,8 +17793,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19110,9 +17955,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19141,10 +17983,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19177,10 +18015,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: 
; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19237,8 +18071,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19257,8 +18089,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19421,9 +18251,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19452,10 +18279,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: 
flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19486,10 +18309,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19546,8 +18365,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19566,8 +18383,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword 
s7, s[8:9], 0x8 @@ -19721,9 +18536,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19752,10 +18564,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19786,10 +18594,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19846,8 +18650,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19866,8 +18668,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20019,9 +18819,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20050,10 +18847,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20086,10 +18879,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20146,8 +18935,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20166,8 +18953,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20330,9 +19115,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20361,10 +19143,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20397,10 +19175,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20457,8 +19231,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20477,8 +19249,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20641,9 +19411,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; 
%entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20672,10 +19439,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20708,10 +19471,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20768,8 +19527,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20788,8 +19545,6 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20952,9 +19707,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20983,10 +19735,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21019,10 +19767,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; 
GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21079,8 +19823,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21099,8 +19841,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21263,9 +20003,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21294,10 +20031,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; 
GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21330,10 +20063,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21390,8 +20119,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21410,8 +20137,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21572,9 +20297,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21603,10 +20325,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21639,10 +20357,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21699,8 +20413,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21719,8 +20431,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21883,9 +20593,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21914,10 +20621,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21950,10 +20653,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22010,8 +20709,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22030,8 +20727,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22194,9 +20889,6 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22225,10 +20917,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s10, s10, s15 -; GFX10-WGP-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22261,10 +20949,6 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s10, s10, s15 -; GFX10-CU-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22321,8 +21005,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22341,8 +21023,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 8d14f92d9806e..8a02ad5dfdb7b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -40,9 +40,6 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX7-LABEL: global_agent_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 
-; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -241,9 +238,6 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX7-LABEL: global_agent_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -443,9 +437,6 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX7-LABEL: global_agent_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -661,9 +652,6 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX7-LABEL: global_agent_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -888,9 +876,6 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX7-LABEL: global_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1060,9 +1045,6 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX7-LABEL: global_agent_monotonic_store: ; GFX7: ; %bb.0: 
; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1233,9 +1215,6 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX7-LABEL: global_agent_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1432,9 +1411,6 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX7-LABEL: global_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1630,9 +1606,6 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX7-LABEL: global_agent_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1793,9 +1766,6 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX7-LABEL: global_agent_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1982,9 +1952,6 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; 
GFX7-LABEL: global_agent_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2172,9 +2139,6 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_agent_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2389,9 +2353,6 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_agent_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2606,9 +2567,6 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2812,9 +2770,6 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) 
@@ -3048,9 +3003,6 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3284,9 +3236,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3515,9 +3464,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3772,9 +3718,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4030,9 +3973,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4315,9 +4255,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4599,9 +4536,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4857,9 +4791,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5116,9 +5047,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5401,9 +5329,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: 
global_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5686,9 +5611,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5971,9 +5893,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6256,9 +6175,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6541,9 +6457,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; 
GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6826,9 +6739,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -7111,9 +7021,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -7396,9 +7303,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7657,9 +7561,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7934,9 +7835,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8222,9 +8120,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8530,9 +8425,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8837,9 +8729,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9118,9 +9007,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, 
s[6:7], 0x3 @@ -9396,9 +9282,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9704,9 +9587,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10012,9 +9892,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10320,9 +10197,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10628,9 +10502,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10932,9 +10803,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11240,9 +11108,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11548,9 +11413,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11854,9 +11716,6 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX7-LABEL: global_agent_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12055,9 +11914,6 @@ 
define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX7-LABEL: global_agent_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12257,9 +12113,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX7-LABEL: global_agent_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12475,9 +12328,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -12702,9 +12552,6 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX7-LABEL: global_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12874,9 +12721,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX7-LABEL: global_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: 
s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13047,9 +12891,6 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX7-LABEL: global_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13246,9 +13087,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13444,9 +13282,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13607,9 +13442,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13796,9 +13628,6 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13986,9 +13815,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14203,9 +14029,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -14420,9 +14243,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14626,9 +14446,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14862,9 +14679,6 
@@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15098,9 +14912,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15329,9 +15140,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15586,9 +15394,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15844,9 +15649,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16129,9 +15931,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16413,9 +16212,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16671,9 +16467,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16930,9 +16723,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ 
-17215,9 +17005,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17500,9 +17287,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17785,9 +17569,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18070,9 +17851,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18355,9 +18133,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, 
s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18640,9 +18415,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18925,9 +18697,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -19210,9 +18979,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19471,9 +19237,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: 
s_load_dword s8, s[6:7], 0x3 @@ -19749,9 +19512,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20057,9 +19817,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20364,9 +20121,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20645,9 +20399,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20923,9 +20674,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: 
global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21231,9 +20979,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21539,9 +21284,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21847,9 +21589,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22155,9 +21894,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22459,9 +22195,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22767,9 +22500,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23075,9 +22805,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 1069cb6f0135d..14f1734235673 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -35,9 +35,6 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX7-LABEL: global_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry 
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -245,9 +242,6 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX7-LABEL: global_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 @@ -512,9 +506,6 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX7-LABEL: global_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -717,9 +708,6 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX7-LABEL: global_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -973,9 +961,6 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX7-LABEL: global_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index bf4d77ad61c6b..33aaeebf658dd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -40,9 +40,6 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX7-LABEL: global_singlethread_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -241,9 +238,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX7-LABEL: global_singlethread_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -442,9 +436,6 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX7-LABEL: global_singlethread_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -643,9 +634,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX7-LABEL: global_singlethread_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -837,9 +825,6 @@ define amdgpu_kernel void 
@global_singlethread_unordered_store( ; ; GFX7-LABEL: global_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1009,9 +994,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX7-LABEL: global_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1181,9 +1163,6 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX7-LABEL: global_singlethread_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1353,9 +1332,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX7-LABEL: global_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1525,9 +1501,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX7-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 
s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1686,9 +1659,6 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1847,9 +1817,6 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX7-LABEL: global_singlethread_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2008,9 +1975,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2169,9 +2133,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2332,9 +2293,6 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 
-; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2521,9 +2479,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2710,9 +2665,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2901,9 +2853,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3130,9 +3079,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3359,9 +3305,6 @@ define 
amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3588,9 +3531,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3817,9 +3757,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4046,9 +3983,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4275,9 +4209,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: 
s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4504,9 +4435,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4733,9 +4661,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4962,9 +4887,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5191,9 +5113,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5420,9 +5339,6 @@ define 
amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5649,9 +5565,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5878,9 +5791,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6107,9 +6017,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6339,9 +6246,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: 
s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6599,9 +6503,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6859,9 +6760,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7119,9 +7017,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7379,9 +7274,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; 
GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7639,9 +7531,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7899,9 +7788,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8159,9 +8045,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8419,9 +8302,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8679,9 +8559,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: 
global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8939,9 +8816,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9199,9 +9073,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9459,9 +9330,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9719,9 +9587,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9979,9 +9844,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10239,9 +10101,6 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX7-LABEL: global_singlethread_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10440,9 +10299,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10641,9 +10497,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10842,9 +10695,6 @@ define amdgpu_kernel void 
@global_singlethread_one_as_seq_cst_load( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11036,9 +10886,6 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX7-LABEL: global_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11208,9 +11055,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11380,9 +11224,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX7-LABEL: global_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11552,9 +11393,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, 
s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11724,9 +11562,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11885,9 +11720,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12046,9 +11878,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12207,9 +12036,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12368,9 +12194,6 @@ define amdgpu_kernel void 
@global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12531,9 +12354,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12720,9 +12540,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12909,9 +12726,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13100,9 +12914,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13329,9 +13140,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13558,9 +13366,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13787,9 +13592,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14016,9 +13818,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 
s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14245,9 +14044,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14474,9 +14270,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14703,9 +14496,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14932,9 +14722,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15161,9 +14948,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; 
; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15390,9 +15174,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15619,9 +15400,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15848,9 +15626,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16077,9 +15852,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, 
s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16306,9 +16078,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16538,9 +16307,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16798,9 +16564,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17058,9 +16821,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, 
s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17318,9 +17078,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17578,9 +17335,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17838,9 +17592,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18098,9 +17849,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18358,9 +18106,6 @@ define amdgpu_kernel void 
@global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18618,9 +18363,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18878,9 +18620,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19138,9 +18877,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19398,9 +19134,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; 
GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19658,9 +19391,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19918,9 +19648,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20178,9 +19905,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index ecef93dcd84d9..ae5ec082024fd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ 
-40,9 +40,6 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX7-LABEL: global_system_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -241,9 +238,6 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX7-LABEL: global_system_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -443,9 +437,6 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX7-LABEL: global_system_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -663,9 +654,6 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX7-LABEL: global_system_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -892,9 +880,6 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX7-LABEL: global_system_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 
s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1064,9 +1049,6 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX7-LABEL: global_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1237,9 +1219,6 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX7-LABEL: global_system_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1438,9 +1417,6 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX7-LABEL: global_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1638,9 +1614,6 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX7-LABEL: global_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1801,9 +1774,6 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX7-LABEL: global_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: 
s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1992,9 +1962,6 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX7-LABEL: global_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2184,9 +2151,6 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2405,9 +2369,6 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2626,9 +2587,6 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2834,9 +2792,6 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3074,9 +3029,6 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3314,9 +3266,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3545,9 +3494,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3804,9 +3750,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4064,9 +4007,6 @@ define amdgpu_kernel void 
@global_system_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4353,9 +4293,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4641,9 +4578,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4901,9 +4835,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5162,9 +5093,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: 
s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5451,9 +5379,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5740,9 +5665,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6029,9 +5951,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6318,9 +6237,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6579,9 +6495,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: 
global_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6859,9 +6772,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7171,9 +7081,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7482,9 +7389,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7765,9 +7669,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8045,9 +7946,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8357,9 +8255,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8669,9 +8564,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8981,9 +8873,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9293,9 +9182,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: 
global_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9601,9 +9487,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9913,9 +9796,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10225,9 +10105,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10535,9 +10412,6 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX7-LABEL: global_system_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10736,9 +10610,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX7-LABEL: global_system_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10938,9 +10809,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX7-LABEL: global_system_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11158,9 +11026,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX7-LABEL: global_system_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11387,9 +11252,6 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX7-LABEL: global_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11559,9 +11421,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX7-LABEL: global_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11732,9 +11591,6 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX7-LABEL: global_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11933,9 +11789,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX7-LABEL: global_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12133,9 +11986,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12296,9 +12146,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12487,9 +12334,6 @@ define amdgpu_kernel 
void @global_system_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12679,9 +12523,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12900,9 +12741,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13121,9 +12959,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13329,9 +13164,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, 
s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13569,9 +13401,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13809,9 +13638,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14040,9 +13866,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14299,9 +14122,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14559,9 +14379,6 @@ define amdgpu_kernel void 
@global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14848,9 +14665,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15136,9 +14950,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15396,9 +15207,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15657,9 +15465,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 
s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15946,9 +15751,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16235,9 +16037,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16524,9 +16323,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16813,9 +16609,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17102,9 +16895,6 @@ 
define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17391,9 +17181,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17680,9 +17467,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17969,9 +17753,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18230,9 +18011,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18509,9 +18287,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18799,9 +18574,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19111,9 +18883,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19422,9 +19191,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19705,9 +19471,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19985,9 +19748,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20297,9 +20057,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20609,9 +20366,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20921,9 +20675,6 @@ define amdgpu_kernel void 
@global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21233,9 +20984,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21541,9 +21289,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21853,9 +21598,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22165,9 +21907,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index eea71794e549e..d916ff533e77b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -36,9 +36,6 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX7-LABEL: global_volatile_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -195,9 +192,6 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX7-LABEL: global_volatile_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 @@ -395,9 +389,6 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX7-LABEL: global_volatile_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -563,9 +554,6 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX7-LABEL: global_volatile_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -769,9 +757,6 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX7-LABEL: global_volatile_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -915,9 +900,6 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX7-LABEL: global_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 95bc4ddd0cff7..aaa11c0455606 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -40,9 +40,6 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX7-LABEL: global_wavefront_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -241,9 +238,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX7-LABEL: global_wavefront_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -442,9 +436,6 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX7-LABEL: global_wavefront_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -643,9 +634,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX7-LABEL: global_wavefront_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -837,9 +825,6 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX7-LABEL: global_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1009,9 +994,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX7-LABEL: global_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1181,9 +1163,6 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX7-LABEL: global_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry -; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1353,9 +1332,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX7-LABEL: global_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1525,9 +1501,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX7-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1686,9 +1659,6 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1847,9 +1817,6 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX7-LABEL: global_wavefront_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2008,9 +1975,6 @@ define amdgpu_kernel void 
@global_wavefront_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2169,9 +2133,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2332,9 +2293,6 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2521,9 +2479,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2710,9 +2665,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 
0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2901,9 +2853,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3130,9 +3079,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3359,9 +3305,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3588,9 +3531,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3817,9 +3757,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; 
GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4046,9 +3983,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4275,9 +4209,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4504,9 +4435,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4733,9 +4661,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: 
s_load_dword s7, s[4:5], 0x2 @@ -4962,9 +4887,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5191,9 +5113,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5420,9 +5339,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5649,9 +5565,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5878,9 +5791,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6107,9 +6017,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6339,9 +6246,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6599,9 +6503,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6859,9 +6760,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 
0x3 @@ -7119,9 +7017,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7379,9 +7274,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7639,9 +7531,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7899,9 +7788,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8159,9 +8045,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8419,9 +8302,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8679,9 +8559,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8939,9 +8816,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9199,9 +9073,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, 
s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9459,9 +9330,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9719,9 +9587,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9979,9 +9844,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10239,9 +10101,6 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX7-LABEL: global_wavefront_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10440,9 +10299,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_load: ; GFX7: ; %bb.0: ; 
%entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10641,9 +10497,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -10842,9 +10695,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11036,9 +10886,6 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX7-LABEL: global_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11208,9 +11055,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) @@ -11380,9 +11224,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX7-LABEL: global_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11552,9 +11393,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11724,9 +11562,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -11885,9 +11720,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12046,9 +11878,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, 
s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12207,9 +12036,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12368,9 +12194,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12531,9 +12354,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12720,9 +12540,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12909,9 +12726,6 @@ define amdgpu_kernel void 
@global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13100,9 +12914,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13329,9 +13140,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13558,9 +13366,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13787,9 +13592,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, 
s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14016,9 +13818,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14245,9 +14044,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14474,9 +14270,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14703,9 +14496,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; 
GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14932,9 +14722,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15161,9 +14948,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15390,9 +15174,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15619,9 +15400,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15848,9 +15626,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: 
global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16077,9 +15852,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16306,9 +16078,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16538,9 +16307,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16798,9 +16564,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17058,9 +16821,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17318,9 +17078,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17578,9 +17335,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17838,9 +17592,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18098,9 +17849,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18358,9 +18106,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18618,9 +18363,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18878,9 +18620,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19138,9 +18877,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg 
; ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19398,9 +19134,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19658,9 +19391,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19918,9 +19648,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20178,9 +19905,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 28c6f255e86a8..26511f079fa8f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -40,9 +40,6 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX7-LABEL: global_workgroup_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -241,9 +238,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX7-LABEL: global_workgroup_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -442,9 +436,6 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX7-LABEL: global_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -649,9 +640,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX7-LABEL: global_workgroup_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: 
s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -859,9 +847,6 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX7-LABEL: global_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1031,9 +1016,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX7-LABEL: global_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1204,9 +1186,6 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX7-LABEL: global_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1395,9 +1374,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX7-LABEL: global_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1585,9 +1561,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX7-LABEL: global_workgroup_monotonic_atomicrmw: ; 
GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1746,9 +1719,6 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -1918,9 +1888,6 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX7-LABEL: global_workgroup_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2098,9 +2065,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2288,9 +2252,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -2479,9 +2440,6 @@ 
define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2674,9 +2632,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2889,9 +2844,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3105,9 +3057,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3334,9 +3283,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3574,9 +3520,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3822,9 +3765,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4080,9 +4020,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4337,9 +4274,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4576,9 +4510,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( 
; ; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4816,9 +4747,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5074,9 +5002,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5332,9 +5257,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5590,9 +5512,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; 
GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5848,9 +5767,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6106,9 +6022,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6364,9 +6277,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6622,9 +6532,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6882,9 +6789,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; 
GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7142,9 +7046,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7408,9 +7309,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7687,9 +7585,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7973,9 +7868,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], 
s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8258,9 +8150,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8525,9 +8414,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8791,9 +8677,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9077,9 +8960,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9363,9 +9243,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: 
global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9649,9 +9526,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9935,9 +9809,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10219,9 +10090,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10505,9 +10373,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10791,9 +10656,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11076,9 +10938,6 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX7-LABEL: global_workgroup_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11277,9 +11136,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11478,9 +11334,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11684,9 +11537,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX7-LABEL: 
global_workgroup_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -11891,9 +11741,6 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX7-LABEL: global_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12063,9 +11910,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12235,9 +12079,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX7-LABEL: global_workgroup_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12418,9 +12259,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], 
s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12601,9 +12439,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12762,9 +12597,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -12933,9 +12765,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13105,9 +12934,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13287,9 +13113,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 @@ -13471,9 +13294,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13665,9 +13485,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13872,9 +13689,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14081,9 +13895,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; 
GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14310,9 +14121,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14549,9 +14357,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14789,9 +14594,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15039,9 +14841,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15289,9 +15088,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: 
global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15528,9 +15324,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15767,9 +15560,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16017,9 +15807,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16267,9 +16054,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16517,9 +16301,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16767,9 +16548,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17017,9 +16795,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17267,9 +17042,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17517,9 +17289,6 @@ define 
amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17770,9 +17539,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18030,9 +17796,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18295,9 +18058,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18566,9 +18326,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; 
GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18844,9 +18601,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19122,9 +18876,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19389,9 +19140,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19654,9 +19402,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19932,9 +19677,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20210,9 +19952,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20488,9 +20227,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20766,9 +20502,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, 
s[6:7], 0x3 @@ -21042,9 +20775,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21320,9 +21050,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21598,9 +21325,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 24598bcbea5bb..fce60ff12aed3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -37,9 +37,6 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX7-LABEL: local_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -243,9 +240,6 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX7-LABEL: local_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -901,9 +895,6 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX7-LABEL: local_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 455834572a59a..a8f7051bd5050 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -33,9 +33,6 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX7-LABEL: local_volatile_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -183,9 +180,6 @@ define amdgpu_kernel void @local_volatile_load_1( ; ; GFX7-LABEL: local_volatile_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; 
GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 55c40ee491100..c3599c87985be 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -37,10 +37,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX7-LABEL: private_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s13 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -56,7 +53,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX10-WGP-LABEL: private_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -72,7 +69,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX10-CU-LABEL: private_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -113,7 +110,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -129,7 +126,7 @@ 
define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -251,10 +248,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX7-LABEL: private_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s13 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -272,7 +266,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX10-WGP-LABEL: private_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 @@ -290,7 +284,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX10-CU-LABEL: private_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 @@ -335,7 +329,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 @@ -356,7 +350,7 @@ define 
amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 @@ -513,7 +507,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX7-LABEL: private_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s13 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -528,7 +522,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX10-WGP-LABEL: private_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -544,7 +538,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX10-CU-LABEL: private_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -579,7 +573,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -595,7 +589,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: 
private_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -711,7 +705,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX7-LABEL: private_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s13 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -728,7 +722,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX10-WGP-LABEL: private_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -744,7 +738,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX10-CU-LABEL: private_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -781,7 +775,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -800,7 +794,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; 
GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -951,10 +945,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX7-LABEL: private_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s13 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -970,7 +961,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX10-WGP-LABEL: private_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -986,7 +977,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX10-CU-LABEL: private_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1027,7 +1018,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1043,7 +1034,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index cdb1b463ac321..9146f175eefcd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -37,10 +37,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX7-LABEL: private_volatile_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s13 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -56,7 +53,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX10-WGP-LABEL: private_volatile_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -72,7 +69,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX10-CU-LABEL: private_volatile_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -201,10 +198,7 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX7-LABEL: private_volatile_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX7-NEXT: s_add_i32 s10, s10, s15 -; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s13 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -222,7 +216,7 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX10-WGP-LABEL: private_volatile_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 @@ -240,7 +234,7 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX10-CU-LABEL: private_volatile_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 @@ -392,7 +386,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX7-LABEL: private_volatile_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s13 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -408,7 +402,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX10-WGP-LABEL: private_volatile_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -425,7 +419,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX10-CU-LABEL: private_volatile_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -555,7 +549,7 @@ define 
amdgpu_kernel void @private_volatile_store_1( ; ; GFX7-LABEL: private_volatile_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s13 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -573,7 +567,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX10-WGP-LABEL: private_volatile_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -590,7 +584,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX10-CU-LABEL: private_volatile_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 86cdf3ccd0441..02ce58dd75403 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -34,13 +34,10 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -59,13 +56,10 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 
s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -152,9 +146,6 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_imin_sle_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -166,9 +157,6 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_imin_sle_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -230,9 +218,6 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; CI-LABEL: s_test_imin_sle_v1i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -244,9 +229,6 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; VI-LABEL: s_test_imin_sle_v1i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: 
s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -310,9 +292,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; CI-LABEL: s_test_imin_sle_v4i32: ; CI: ; %bb.0: -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -331,9 +310,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; VI-LABEL: s_test_imin_sle_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -446,14 +422,11 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; CI-NEXT: s_load_dword s2, s[6:7], 0xa ; CI-NEXT: s_load_dword s3, s[6:7], 0x13 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: s_sext_i32_i8 s3, s3 ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_byte v[0:1], v2 @@ -464,14 +437,11 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; VI-NEXT: s_load_dword s2, s[6:7], 0x28 ; VI-NEXT: s_load_dword s3, s[6:7], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 ; VI-NEXT: s_min_i32 s2, s2, s3 ; 
VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -589,8 +559,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; CI-NEXT: s_load_dword s2, s[6:7], 0xa ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_load_dword s3, s[6:7], 0x13 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 24 ; CI-NEXT: s_sext_i32_i8 s5, s2 @@ -614,7 +582,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; CI-NEXT: s_and_b32 s3, s3, 0xffff ; CI-NEXT: s_or_b32 s2, s3, s2 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -624,8 +591,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x28 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 24 ; VI-NEXT: s_bfe_i32 s5, s2, 0x80010 @@ -649,7 +614,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_or_b32 s2, s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -804,9 +768,6 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; CI-LABEL: s_test_imin_sle_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 16 ; CI-NEXT: s_sext_i32_i16 s2, s2 @@ -826,9 +787,6 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; VI-LABEL: s_test_imin_sle_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 16 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -963,9 +921,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s6, s0, 16 ; CI-NEXT: s_ashr_i32 s7, s1, 16 @@ -996,9 +951,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s6, s1, 16 ; VI-NEXT: s_sext_i32_i16 s1, s1 @@ -1097,13 +1049,10 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, 
vcc, 0, v3, vcc @@ -1122,13 +1071,10 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1244,13 +1190,10 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1269,13 +1212,10 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1363,9 +1303,6 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) 
%out, i32 %a, i3 ; CI-LABEL: s_test_imin_slt_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1377,9 +1314,6 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_imin_slt_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1443,9 +1377,6 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s1, s1, s3 ; CI-NEXT: s_min_i32 s0, s0, s2 @@ -1460,9 +1391,6 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s1, s1, s3 ; VI-NEXT: s_min_i32 s0, s0, s2 @@ -1537,9 +1465,6 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 
s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1552,9 +1477,6 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1622,9 +1544,6 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1637,9 +1556,6 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1719,13 +1635,10 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1744,13 +1657,10 @@ define amdgpu_kernel void 
@v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1856,15 +1766,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v6 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, s5 ; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v6 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] ; CI-NEXT: flat_load_dwordx3 v[3:5], v[3:4] @@ -1883,15 +1790,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v6 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] ; VI-NEXT: 
flat_load_dwordx3 v[3:5], v[3:4] @@ -2025,15 +1929,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] @@ -2064,15 +1965,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] @@ -2171,9 +2069,6 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_umin_ule_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2185,9 +2080,6 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; 
VI-LABEL: s_test_umin_ule_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2262,13 +2154,10 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -2287,13 +2176,10 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -2399,9 +2285,6 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: 
v_add_i32_e32 v1, vcc, s2, v0 @@ -2423,9 +2306,6 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0 @@ -2513,9 +2393,6 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_umin_ult_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2527,9 +2404,6 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_umin_ult_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2613,9 +2487,6 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; CI-LABEL: v_test_umin_ult_i32_multi_use: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2637,9 +2508,6 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; VI-LABEL: v_test_umin_ult_i32_multi_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2769,9 +2637,6 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; CI-LABEL: v_test_umin_ult_i16_multi_use: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 @@ -2794,9 +2659,6 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; VI-LABEL: v_test_umin_ult_i16_multi_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2889,9 +2751,6 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; CI-LABEL: s_test_umin_ult_v1i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2903,9 +2762,6 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; VI-LABEL: s_test_umin_ult_v1i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2977,9 +2833,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 
x i32 ; ; CI-LABEL: s_test_umin_ult_v8i32: ; CI: ; %bb.0: -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3011,9 +2864,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; VI-LABEL: s_test_umin_ult_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3274,9 +3124,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; CI-LABEL: s_test_umin_ult_v8i16: ; CI: ; %bb.0: -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3323,9 +3170,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; VI-LABEL: s_test_umin_ult_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3451,14 +3295,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; CI-NEXT: s_load_dword s2, s[6:7], 0xa ; CI-NEXT: s_load_dword s3, s[6:7], 0x13 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_and_b32 s3, s3, 0xffff ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: 
v_mov_b32_e32 v0, s0 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -3469,14 +3310,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; VI-NEXT: s_load_dword s2, s[6:7], 0x28 ; VI-NEXT: s_load_dword s3, s[6:7], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_and_b32 s3, s3, 0xffff ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -3566,14 +3404,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; CI-NEXT: s_load_dword s2, s[6:7], 0xa ; CI-NEXT: s_load_dword s3, s[6:7], 0x13 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s2, s2 ; CI-NEXT: s_sext_i32_i16 s3, s3 ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -3584,14 +3419,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; VI-NEXT: s_load_dword s2, s[6:7], 0x28 ; VI-NEXT: s_load_dword s3, s[6:7], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: 
v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -3688,9 +3520,6 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s3, s2 ; CI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3705,9 +3534,6 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s3, s2 ; VI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3790,9 +3616,6 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3811,9 +3634,6 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3906,9 +3726,6 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; CI-NEXT: 
s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3927,9 +3744,6 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -4022,9 +3836,6 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -4043,9 +3854,6 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -4138,9 +3946,6 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -4159,9 +3964,6 @@ define amdgpu_kernel void 
@test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -4278,12 +4080,9 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_load_dword v4, v[0:1] @@ -4312,13 +4111,10 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -4436,12 +4232,9 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: 
v_add_i32_e32 v0, vcc, s2, v2 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_load_dword v4, v[0:1] @@ -4469,13 +4262,10 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll index d7814c52828b8..5792fab7011af 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -180,9 +180,6 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -263,9 +260,6 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -347,9 +341,6 @@ define amdgpu_kernel void 
@v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -412,9 +403,6 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -477,9 +465,6 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -542,9 +527,6 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -606,9 +588,6 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX8-NEXT: s_add_i32 s10, s10, s15 -; GFX8-NEXT: 
s_mov_b32 flat_scratch_lo, s11 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll index 3de6945f95556..529e64715500d 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -176,9 +176,6 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -257,9 +254,6 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -339,9 +333,6 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -402,9 +393,6 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 -; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -466,9 +454,6 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -529,9 +514,6 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 993eb11ef5e95..5a8d0f3d0f158 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -10,36 +10,36 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908: bb.0 (%ir-block.0): ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} - ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* 
regdef:VReg_128 */, def %27 - ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %27 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %24 - ; REGALLOC-GFX908-NEXT: SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %26 + ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %26 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %23 + ; REGALLOC-GFX908-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]] - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec ; 
REGALLOC-GFX908-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX908-NEXT: S_ENDPGM 0 ; ; PEI-GFX908-LABEL: name: partial_copy ; PEI-GFX908: bb.0 (%ir-block.0): - ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9 + ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7 ; PEI-GFX908-NEXT: {{ $}} - ; PEI-GFX908-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 - ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 + ; PEI-GFX908-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; PEI-GFX908-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; PEI-GFX908-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* 
sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0 ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 - ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) + ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) @@ -48,7 +48,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec ; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) + ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 
0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -59,34 +59,34 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A: bb.0 (%ir-block.0): ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %26 - ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %26 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %24 - ; REGALLOC-GFX90A-NEXT: SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %25 + ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %25 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %23 + ; REGALLOC-GFX90A-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store 
(s64) into %stack.0, align 4, addrspace 5) + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; REGALLOC-GFX90A-NEXT: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX90A-NEXT: S_ENDPGM 0 ; ; PEI-GFX90A-LABEL: name: partial_copy ; PEI-GFX90A: bb.0 (%ir-block.0): - ; PEI-GFX90A-NEXT: liveins: 
$agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9 + ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7 ; PEI-GFX90A-NEXT: {{ $}} - ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 - ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 + ; PEI-GFX90A-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; PEI-GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; PEI-GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0 ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1 - ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) + ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable 
$agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) @@ -94,7 +94,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec ; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) + ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 847c2d343d415..c2132cf907fdb 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -6,9 +6,6 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, 
s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -36,12 +33,9 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a ; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x5a -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_sad_u32 v2, s2, v0, 20 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -63,9 +57,6 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -92,9 +83,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s16, s16, s15 +; GCN-NEXT: s_add_u32 s16, s16, s13 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 @@ -103,7 +92,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: buffer_store_dword v2, v0, 
s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -131,15 +119,12 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s16, s16, s15 +; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_sad_u32 v2, s0, v2, v3 ; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen @@ -166,9 +151,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s16, s16, s15 +; GCN-NEXT: s_add_u32 s16, s16, s13 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s3, s0, s1 @@ -179,7 +162,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -204,9 +186,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; 
GCN-NEXT: s_add_u32 s16, s16, s15 +; GCN-NEXT: s_add_u32 s16, s16, s13 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 @@ -217,7 +197,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -243,9 +222,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s16, s16, s15 +; GCN-NEXT: s_add_u32 s16, s16, s13 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s1 @@ -256,7 +233,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i ; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -279,9 +255,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s16, s16, s15 +; GCN-NEXT: s_add_u32 s16, s16, s13 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 @@ -290,7 +264,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ; GCN-NEXT: 
v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -312,9 +285,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 @@ -351,9 +321,6 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 @@ -391,8 +358,6 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 ; GCN-NEXT: s_load_dword s4, s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NEXT: s_lshr_b32 s0, s0, 16 @@ -400,7 +365,6 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: v_sad_u32 v2, s4, v1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, 
s10, 8 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: flat_store_short v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -420,9 +384,6 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { ; GCN-LABEL: v_sad_u32_i16_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: flat_load_ushort v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -455,9 +416,6 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xff ; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 @@ -485,9 +443,6 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { ; GCN-LABEL: v_sad_u32_i8_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -520,9 +475,6 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xff ; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 @@ -550,9 +502,6 @@ define amdgpu_kernel void 
@v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s6, s0, s1 ; GCN-NEXT: s_cmp_le_u32 s0, s1 @@ -582,9 +531,6 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s3 ; GCN-NEXT: s_sub_i32 s6, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll index 9826585df8bd8..89a09dc4fcc17 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll @@ -9,8 +9,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -26,8 +24,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX906-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 ; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -43,8 
+39,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX908-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -61,8 +55,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -96,8 +88,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -113,8 +103,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX906-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 ; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -130,8 +118,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; 
GFX908-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -148,8 +134,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 5d0ddcc7114c2..5dcb4b7c979ab 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -24,179 +24,175 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_writelane_b32 v22, s2, 0 ; CHECK-NEXT: v_writelane_b32 v22, s3, 1 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[48:51] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:11] +; CHECK-NEXT: ; def s[4:7] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_writelane_b32 v23, s4, 2 ; CHECK-NEXT: v_writelane_b32 v23, s5, 3 ; CHECK-NEXT: v_writelane_b32 v23, s6, 4 ; CHECK-NEXT: v_writelane_b32 v23, s7, 5 -; CHECK-NEXT: v_writelane_b32 v23, s8, 6 -; CHECK-NEXT: v_writelane_b32 v23, s9, 7 -; CHECK-NEXT: v_writelane_b32 v23, s10, 8 -; CHECK-NEXT: v_writelane_b32 v23, s11, 9 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:11] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s4, 6 +; CHECK-NEXT: v_writelane_b32 v23, s5, 7 +; CHECK-NEXT: v_writelane_b32 v23, s6, 8 +; CHECK-NEXT: v_writelane_b32 v23, s7, 9 +; CHECK-NEXT: v_writelane_b32 v23, s8, 10 +; CHECK-NEXT: 
v_writelane_b32 v23, s9, 11 +; CHECK-NEXT: v_writelane_b32 v23, s10, 12 +; CHECK-NEXT: v_writelane_b32 v23, s11, 13 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:19] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 10 -; CHECK-NEXT: v_writelane_b32 v23, s5, 11 -; CHECK-NEXT: v_writelane_b32 v23, s6, 12 -; CHECK-NEXT: v_writelane_b32 v23, s7, 13 -; CHECK-NEXT: v_writelane_b32 v23, s8, 14 -; CHECK-NEXT: v_writelane_b32 v23, s9, 15 -; CHECK-NEXT: v_writelane_b32 v23, s10, 16 -; CHECK-NEXT: v_writelane_b32 v23, s11, 17 -; CHECK-NEXT: v_writelane_b32 v23, s12, 18 -; CHECK-NEXT: v_writelane_b32 v23, s13, 19 -; CHECK-NEXT: v_writelane_b32 v23, s14, 20 -; CHECK-NEXT: v_writelane_b32 v23, s15, 21 -; CHECK-NEXT: v_writelane_b32 v23, s16, 22 -; CHECK-NEXT: v_writelane_b32 v23, s17, 23 -; CHECK-NEXT: v_writelane_b32 v23, s18, 24 -; CHECK-NEXT: v_writelane_b32 v23, s19, 25 +; CHECK-NEXT: v_writelane_b32 v23, s4, 14 +; CHECK-NEXT: v_writelane_b32 v23, s5, 15 +; CHECK-NEXT: v_writelane_b32 v23, s6, 16 +; CHECK-NEXT: v_writelane_b32 v23, s7, 17 +; CHECK-NEXT: v_writelane_b32 v23, s8, 18 +; CHECK-NEXT: v_writelane_b32 v23, s9, 19 +; CHECK-NEXT: v_writelane_b32 v23, s10, 20 +; CHECK-NEXT: v_writelane_b32 v23, s11, 21 +; CHECK-NEXT: v_writelane_b32 v23, s12, 22 +; CHECK-NEXT: v_writelane_b32 v23, s13, 23 +; CHECK-NEXT: v_writelane_b32 v23, s14, 24 +; CHECK-NEXT: v_writelane_b32 v23, s15, 25 +; CHECK-NEXT: v_writelane_b32 v23, s16, 26 +; CHECK-NEXT: v_writelane_b32 v23, s17, 27 +; CHECK-NEXT: v_writelane_b32 v23, s18, 28 +; CHECK-NEXT: v_writelane_b32 v23, s19, 29 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[38:39] +; CHECK-NEXT: ; def s[42:43] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[44:47] +; CHECK-NEXT: ; def s[52:55] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 26 -; CHECK-NEXT: v_writelane_b32 v23, s5, 27 -; CHECK-NEXT: 
v_writelane_b32 v23, s6, 28 -; CHECK-NEXT: v_writelane_b32 v23, s7, 29 -; CHECK-NEXT: v_writelane_b32 v23, s8, 30 -; CHECK-NEXT: v_writelane_b32 v23, s9, 31 -; CHECK-NEXT: v_writelane_b32 v23, s10, 32 -; CHECK-NEXT: v_writelane_b32 v23, s11, 33 +; CHECK-NEXT: v_writelane_b32 v23, s4, 30 +; CHECK-NEXT: v_writelane_b32 v23, s5, 31 +; CHECK-NEXT: v_writelane_b32 v23, s6, 32 +; CHECK-NEXT: v_writelane_b32 v23, s7, 33 +; CHECK-NEXT: v_writelane_b32 v23, s8, 34 +; CHECK-NEXT: v_writelane_b32 v23, s9, 35 +; CHECK-NEXT: v_writelane_b32 v23, s10, 36 +; CHECK-NEXT: v_writelane_b32 v23, s11, 37 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[36:37] +; CHECK-NEXT: ; def s[40:41] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[40:43] +; CHECK-NEXT: ; def s[36:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ; def s[44:51] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 34 -; CHECK-NEXT: v_writelane_b32 v23, s1, 35 -; CHECK-NEXT: v_writelane_b32 v23, s2, 36 -; CHECK-NEXT: v_writelane_b32 v23, s3, 37 -; CHECK-NEXT: v_writelane_b32 v23, s4, 38 -; CHECK-NEXT: v_writelane_b32 v23, s5, 39 -; CHECK-NEXT: v_writelane_b32 v23, s6, 40 -; CHECK-NEXT: v_writelane_b32 v23, s7, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 42 -; CHECK-NEXT: v_writelane_b32 v23, s1, 43 -; CHECK-NEXT: v_writelane_b32 v23, s2, 44 -; CHECK-NEXT: v_writelane_b32 v23, s3, 45 -; CHECK-NEXT: v_writelane_b32 v23, s4, 46 -; CHECK-NEXT: v_writelane_b32 v23, s5, 47 -; CHECK-NEXT: v_writelane_b32 v23, s6, 48 -; CHECK-NEXT: v_writelane_b32 v23, s7, 49 -; CHECK-NEXT: v_writelane_b32 v23, s8, 50 -; CHECK-NEXT: v_writelane_b32 v23, s9, 51 -; CHECK-NEXT: v_writelane_b32 v23, s10, 52 -; CHECK-NEXT: v_writelane_b32 v23, 
s11, 53 -; CHECK-NEXT: v_writelane_b32 v23, s12, 54 -; CHECK-NEXT: v_writelane_b32 v23, s13, 55 -; CHECK-NEXT: v_writelane_b32 v23, s14, 56 -; CHECK-NEXT: v_writelane_b32 v23, s15, 57 +; CHECK-NEXT: v_writelane_b32 v23, s0, 38 +; CHECK-NEXT: v_writelane_b32 v23, s1, 39 +; CHECK-NEXT: v_writelane_b32 v23, s2, 40 +; CHECK-NEXT: v_writelane_b32 v23, s3, 41 +; CHECK-NEXT: v_writelane_b32 v23, s4, 42 +; CHECK-NEXT: v_writelane_b32 v23, s5, 43 +; CHECK-NEXT: v_writelane_b32 v23, s6, 44 +; CHECK-NEXT: v_writelane_b32 v23, s7, 45 +; CHECK-NEXT: v_writelane_b32 v23, s8, 46 +; CHECK-NEXT: v_writelane_b32 v23, s9, 47 +; CHECK-NEXT: v_writelane_b32 v23, s10, 48 +; CHECK-NEXT: v_writelane_b32 v23, s11, 49 +; CHECK-NEXT: v_writelane_b32 v23, s12, 50 +; CHECK-NEXT: v_writelane_b32 v23, s13, 51 +; CHECK-NEXT: v_writelane_b32 v23, s14, 52 +; CHECK-NEXT: v_writelane_b32 v23, s15, 53 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 54 +; CHECK-NEXT: v_writelane_b32 v23, s1, 55 +; CHECK-NEXT: v_writelane_b32 v23, s2, 56 +; CHECK-NEXT: v_writelane_b32 v23, s3, 57 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_writelane_b32 v23, s0, 58 ; CHECK-NEXT: v_writelane_b32 v23, s1, 59 ; CHECK-NEXT: v_writelane_b32 v23, s2, 60 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: v_writelane_b32 v23, s3, 61 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 62 -; CHECK-NEXT: v_writelane_b32 v0, s2, 0 -; CHECK-NEXT: v_writelane_b32 v0, s3, 1 -; CHECK-NEXT: v_writelane_b32 v0, s4, 2 -; CHECK-NEXT: v_writelane_b32 v0, s5, 3 -; CHECK-NEXT: v_writelane_b32 v0, s6, 4 -; CHECK-NEXT: v_writelane_b32 v23, s1, 63 -; CHECK-NEXT: v_writelane_b32 v0, s7, 5 +; CHECK-NEXT: v_writelane_b32 v23, s4, 62 +; CHECK-NEXT: v_writelane_b32 v0, s6, 0 +; 
CHECK-NEXT: v_writelane_b32 v23, s5, 63 +; CHECK-NEXT: v_writelane_b32 v0, s7, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 6 -; CHECK-NEXT: v_writelane_b32 v0, s1, 7 -; CHECK-NEXT: v_writelane_b32 v0, s2, 8 -; CHECK-NEXT: v_writelane_b32 v0, s3, 9 -; CHECK-NEXT: v_writelane_b32 v0, s4, 10 -; CHECK-NEXT: v_writelane_b32 v0, s5, 11 -; CHECK-NEXT: v_writelane_b32 v0, s6, 12 -; CHECK-NEXT: v_writelane_b32 v0, s7, 13 -; CHECK-NEXT: v_writelane_b32 v0, s8, 14 -; CHECK-NEXT: v_writelane_b32 v0, s9, 15 -; CHECK-NEXT: v_writelane_b32 v0, s10, 16 -; CHECK-NEXT: v_writelane_b32 v0, s11, 17 -; CHECK-NEXT: v_writelane_b32 v0, s12, 18 -; CHECK-NEXT: v_writelane_b32 v0, s13, 19 -; CHECK-NEXT: v_writelane_b32 v0, s14, 20 -; CHECK-NEXT: v_writelane_b32 v0, s15, 21 +; CHECK-NEXT: v_writelane_b32 v0, s0, 2 +; CHECK-NEXT: v_writelane_b32 v0, s1, 3 +; CHECK-NEXT: v_writelane_b32 v0, s2, 4 +; CHECK-NEXT: v_writelane_b32 v0, s3, 5 +; CHECK-NEXT: v_writelane_b32 v0, s4, 6 +; CHECK-NEXT: v_writelane_b32 v0, s5, 7 +; CHECK-NEXT: v_writelane_b32 v0, s6, 8 +; CHECK-NEXT: v_writelane_b32 v0, s7, 9 +; CHECK-NEXT: v_writelane_b32 v0, s8, 10 +; CHECK-NEXT: v_writelane_b32 v0, s9, 11 +; CHECK-NEXT: v_writelane_b32 v0, s10, 12 +; CHECK-NEXT: v_writelane_b32 v0, s11, 13 +; CHECK-NEXT: v_writelane_b32 v0, s12, 14 +; CHECK-NEXT: v_writelane_b32 v0, s13, 15 +; CHECK-NEXT: v_writelane_b32 v0, s14, 16 +; CHECK-NEXT: v_writelane_b32 v0, s15, 17 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 22 -; CHECK-NEXT: v_writelane_b32 v0, s1, 23 +; CHECK-NEXT: v_writelane_b32 v0, s0, 18 +; CHECK-NEXT: v_writelane_b32 v0, s1, 19 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 20 +; CHECK-NEXT: v_writelane_b32 v0, s1, 21 +; CHECK-NEXT: v_writelane_b32 v0, s2, 22 +; CHECK-NEXT: v_writelane_b32 v0, s3, 23 +; 
CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_writelane_b32 v0, s0, 24 ; CHECK-NEXT: v_writelane_b32 v0, s1, 25 ; CHECK-NEXT: v_writelane_b32 v0, s2, 26 ; CHECK-NEXT: v_writelane_b32 v0, s3, 27 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 28 -; CHECK-NEXT: v_writelane_b32 v0, s1, 29 -; CHECK-NEXT: v_writelane_b32 v0, s2, 30 -; CHECK-NEXT: v_writelane_b32 v0, s3, 31 -; CHECK-NEXT: v_writelane_b32 v0, s4, 32 -; CHECK-NEXT: v_writelane_b32 v0, s5, 33 -; CHECK-NEXT: v_writelane_b32 v0, s6, 34 -; CHECK-NEXT: v_writelane_b32 v0, s7, 35 +; CHECK-NEXT: v_writelane_b32 v0, s4, 28 +; CHECK-NEXT: v_writelane_b32 v0, s5, 29 +; CHECK-NEXT: v_writelane_b32 v0, s6, 30 +; CHECK-NEXT: v_writelane_b32 v0, s7, 31 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 36 -; CHECK-NEXT: v_writelane_b32 v0, s1, 37 -; CHECK-NEXT: v_writelane_b32 v0, s2, 38 -; CHECK-NEXT: v_writelane_b32 v0, s3, 39 -; CHECK-NEXT: v_writelane_b32 v0, s4, 40 -; CHECK-NEXT: v_writelane_b32 v0, s5, 41 -; CHECK-NEXT: v_writelane_b32 v0, s6, 42 -; CHECK-NEXT: v_writelane_b32 v0, s7, 43 -; CHECK-NEXT: v_writelane_b32 v0, s8, 44 -; CHECK-NEXT: v_writelane_b32 v0, s9, 45 -; CHECK-NEXT: v_writelane_b32 v0, s10, 46 -; CHECK-NEXT: v_writelane_b32 v0, s11, 47 -; CHECK-NEXT: v_writelane_b32 v0, s12, 48 -; CHECK-NEXT: v_writelane_b32 v0, s13, 49 -; CHECK-NEXT: v_writelane_b32 v0, s14, 50 -; CHECK-NEXT: v_writelane_b32 v0, s15, 51 +; CHECK-NEXT: v_writelane_b32 v0, s0, 32 +; CHECK-NEXT: v_writelane_b32 v0, s1, 33 +; CHECK-NEXT: v_writelane_b32 v0, s2, 34 +; CHECK-NEXT: v_writelane_b32 v0, s3, 35 +; CHECK-NEXT: v_writelane_b32 v0, s4, 36 +; CHECK-NEXT: v_writelane_b32 v0, s5, 37 +; CHECK-NEXT: v_writelane_b32 v0, s6, 38 +; CHECK-NEXT: v_writelane_b32 v0, s7, 39 +; CHECK-NEXT: v_writelane_b32 v0, s8, 40 +; CHECK-NEXT: v_writelane_b32 v0, s9, 
41 +; CHECK-NEXT: v_writelane_b32 v0, s10, 42 +; CHECK-NEXT: v_writelane_b32 v0, s11, 43 +; CHECK-NEXT: v_writelane_b32 v0, s12, 44 +; CHECK-NEXT: v_writelane_b32 v0, s13, 45 +; CHECK-NEXT: v_writelane_b32 v0, s14, 46 +; CHECK-NEXT: v_writelane_b32 v0, s15, 47 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %ret ; CHECK-NEXT: s_endpgm @@ -210,170 +206,166 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_readlane_b32 s1, v23, 3 ; CHECK-NEXT: v_readlane_b32 s2, v23, 4 ; CHECK-NEXT: v_readlane_b32 s3, v23, 5 -; CHECK-NEXT: v_readlane_b32 s4, v23, 6 -; CHECK-NEXT: v_readlane_b32 s5, v23, 7 -; CHECK-NEXT: v_readlane_b32 s6, v23, 8 -; CHECK-NEXT: v_readlane_b32 s7, v23, 9 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[48:51] +; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 6 +; CHECK-NEXT: v_readlane_b32 s1, v23, 7 +; CHECK-NEXT: v_readlane_b32 s2, v23, 8 +; CHECK-NEXT: v_readlane_b32 s3, v23, 9 +; CHECK-NEXT: v_readlane_b32 s4, v23, 10 +; CHECK-NEXT: v_readlane_b32 s5, v23, 11 +; CHECK-NEXT: v_readlane_b32 s6, v23, 12 +; CHECK-NEXT: v_readlane_b32 s7, v23, 13 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 10 -; CHECK-NEXT: v_readlane_b32 s1, v23, 11 -; CHECK-NEXT: v_readlane_b32 s2, v23, 12 -; CHECK-NEXT: v_readlane_b32 s3, v23, 13 -; CHECK-NEXT: v_readlane_b32 s4, v23, 14 -; CHECK-NEXT: v_readlane_b32 s5, v23, 15 -; CHECK-NEXT: v_readlane_b32 s6, v23, 16 -; CHECK-NEXT: v_readlane_b32 s7, v23, 17 -; CHECK-NEXT: v_readlane_b32 s8, v23, 18 -; CHECK-NEXT: v_readlane_b32 s9, v23, 19 -; CHECK-NEXT: v_readlane_b32 s10, v23, 20 -; CHECK-NEXT: v_readlane_b32 s11, v23, 21 -; CHECK-NEXT: v_readlane_b32 s12, v23, 22 -; CHECK-NEXT: v_readlane_b32 s13, v23, 23 -; CHECK-NEXT: v_readlane_b32 s14, v23, 24 -; CHECK-NEXT: v_readlane_b32 s15, v23, 25 +; CHECK-NEXT: v_readlane_b32 s0, v23, 14 +; CHECK-NEXT: v_readlane_b32 
s1, v23, 15 +; CHECK-NEXT: v_readlane_b32 s2, v23, 16 +; CHECK-NEXT: v_readlane_b32 s3, v23, 17 +; CHECK-NEXT: v_readlane_b32 s4, v23, 18 +; CHECK-NEXT: v_readlane_b32 s5, v23, 19 +; CHECK-NEXT: v_readlane_b32 s6, v23, 20 +; CHECK-NEXT: v_readlane_b32 s7, v23, 21 +; CHECK-NEXT: v_readlane_b32 s8, v23, 22 +; CHECK-NEXT: v_readlane_b32 s9, v23, 23 +; CHECK-NEXT: v_readlane_b32 s10, v23, 24 +; CHECK-NEXT: v_readlane_b32 s11, v23, 25 +; CHECK-NEXT: v_readlane_b32 s12, v23, 26 +; CHECK-NEXT: v_readlane_b32 s13, v23, 27 +; CHECK-NEXT: v_readlane_b32 s14, v23, 28 +; CHECK-NEXT: v_readlane_b32 s15, v23, 29 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 26 -; CHECK-NEXT: v_readlane_b32 s1, v23, 27 -; CHECK-NEXT: v_readlane_b32 s2, v23, 28 -; CHECK-NEXT: v_readlane_b32 s3, v23, 29 -; CHECK-NEXT: v_readlane_b32 s4, v23, 30 -; CHECK-NEXT: v_readlane_b32 s5, v23, 31 -; CHECK-NEXT: v_readlane_b32 s6, v23, 32 -; CHECK-NEXT: v_readlane_b32 s7, v23, 33 +; CHECK-NEXT: v_readlane_b32 s0, v23, 30 +; CHECK-NEXT: v_readlane_b32 s1, v23, 31 +; CHECK-NEXT: v_readlane_b32 s2, v23, 32 +; CHECK-NEXT: v_readlane_b32 s3, v23, 33 +; CHECK-NEXT: v_readlane_b32 s4, v23, 34 +; CHECK-NEXT: v_readlane_b32 s5, v23, 35 +; CHECK-NEXT: v_readlane_b32 s6, v23, 36 +; CHECK-NEXT: v_readlane_b32 s7, v23, 37 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[38:39] +; CHECK-NEXT: ; use s[42:43] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[44:47] +; CHECK-NEXT: ; use s[52:55] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 34 -; CHECK-NEXT: v_readlane_b32 s1, v23, 35 -; CHECK-NEXT: v_readlane_b32 s2, v23, 36 -; CHECK-NEXT: v_readlane_b32 s3, v23, 37 -; CHECK-NEXT: v_readlane_b32 s4, v23, 38 -; CHECK-NEXT: v_readlane_b32 s5, v23, 39 -; CHECK-NEXT: v_readlane_b32 s6, v23, 40 -; CHECK-NEXT: v_readlane_b32 s7, v23, 41 +; 
CHECK-NEXT: v_readlane_b32 s0, v23, 38 +; CHECK-NEXT: v_readlane_b32 s1, v23, 39 +; CHECK-NEXT: v_readlane_b32 s2, v23, 40 +; CHECK-NEXT: v_readlane_b32 s3, v23, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[36:37] +; CHECK-NEXT: ; use s[40:41] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[40:43] +; CHECK-NEXT: ; use s[36:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ; use s[44:51] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 42 -; CHECK-NEXT: v_readlane_b32 s1, v23, 43 -; CHECK-NEXT: v_readlane_b32 s2, v23, 44 -; CHECK-NEXT: v_readlane_b32 s3, v23, 45 -; CHECK-NEXT: v_readlane_b32 s4, v23, 46 -; CHECK-NEXT: v_readlane_b32 s5, v23, 47 -; CHECK-NEXT: v_readlane_b32 s6, v23, 48 -; CHECK-NEXT: v_readlane_b32 s7, v23, 49 -; CHECK-NEXT: v_readlane_b32 s8, v23, 50 -; CHECK-NEXT: v_readlane_b32 s9, v23, 51 -; CHECK-NEXT: v_readlane_b32 s10, v23, 52 -; CHECK-NEXT: v_readlane_b32 s11, v23, 53 -; CHECK-NEXT: v_readlane_b32 s12, v23, 54 -; CHECK-NEXT: v_readlane_b32 s13, v23, 55 -; CHECK-NEXT: v_readlane_b32 s14, v23, 56 -; CHECK-NEXT: v_readlane_b32 s15, v23, 57 +; CHECK-NEXT: v_readlane_b32 s4, v23, 42 +; CHECK-NEXT: v_readlane_b32 s5, v23, 43 +; CHECK-NEXT: v_readlane_b32 s6, v23, 44 +; CHECK-NEXT: v_readlane_b32 s7, v23, 45 +; CHECK-NEXT: v_readlane_b32 s8, v23, 46 +; CHECK-NEXT: v_readlane_b32 s9, v23, 47 +; CHECK-NEXT: v_readlane_b32 s10, v23, 48 +; CHECK-NEXT: v_readlane_b32 s11, v23, 49 +; CHECK-NEXT: v_readlane_b32 s12, v23, 50 +; CHECK-NEXT: v_readlane_b32 s13, v23, 51 +; CHECK-NEXT: v_readlane_b32 s14, v23, 52 +; CHECK-NEXT: v_readlane_b32 s15, v23, 53 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 58 -; CHECK-NEXT: v_readlane_b32 s1, v23, 59 -; CHECK-NEXT: v_readlane_b32 s2, v23, 60 -; CHECK-NEXT: v_readlane_b32 s3, 
v23, 61 +; CHECK-NEXT: v_readlane_b32 s0, v23, 54 +; CHECK-NEXT: v_readlane_b32 s1, v23, 55 +; CHECK-NEXT: v_readlane_b32 s2, v23, 56 +; CHECK-NEXT: v_readlane_b32 s3, v23, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 62 -; CHECK-NEXT: v_readlane_b32 s1, v23, 63 -; CHECK-NEXT: v_readlane_b32 s2, v0, 0 -; CHECK-NEXT: v_readlane_b32 s3, v0, 1 -; CHECK-NEXT: v_readlane_b32 s4, v0, 2 -; CHECK-NEXT: v_readlane_b32 s5, v0, 3 -; CHECK-NEXT: v_readlane_b32 s6, v0, 4 -; CHECK-NEXT: v_readlane_b32 s7, v0, 5 +; CHECK-NEXT: v_readlane_b32 s0, v23, 58 +; CHECK-NEXT: v_readlane_b32 s1, v23, 59 +; CHECK-NEXT: v_readlane_b32 s2, v23, 60 +; CHECK-NEXT: v_readlane_b32 s3, v23, 61 +; CHECK-NEXT: v_readlane_b32 s4, v23, 62 +; CHECK-NEXT: v_readlane_b32 s5, v23, 63 +; CHECK-NEXT: v_readlane_b32 s6, v0, 0 +; CHECK-NEXT: v_readlane_b32 s7, v0, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 6 -; CHECK-NEXT: v_readlane_b32 s1, v0, 7 -; CHECK-NEXT: v_readlane_b32 s2, v0, 8 -; CHECK-NEXT: v_readlane_b32 s3, v0, 9 -; CHECK-NEXT: v_readlane_b32 s4, v0, 10 -; CHECK-NEXT: v_readlane_b32 s5, v0, 11 -; CHECK-NEXT: v_readlane_b32 s6, v0, 12 -; CHECK-NEXT: v_readlane_b32 s7, v0, 13 -; CHECK-NEXT: v_readlane_b32 s8, v0, 14 -; CHECK-NEXT: v_readlane_b32 s9, v0, 15 -; CHECK-NEXT: v_readlane_b32 s10, v0, 16 -; CHECK-NEXT: v_readlane_b32 s11, v0, 17 -; CHECK-NEXT: v_readlane_b32 s12, v0, 18 -; CHECK-NEXT: v_readlane_b32 s13, v0, 19 -; CHECK-NEXT: v_readlane_b32 s14, v0, 20 -; CHECK-NEXT: v_readlane_b32 s15, v0, 21 +; CHECK-NEXT: v_readlane_b32 s0, v0, 2 +; CHECK-NEXT: v_readlane_b32 s1, v0, 3 +; CHECK-NEXT: v_readlane_b32 s2, v0, 4 +; CHECK-NEXT: v_readlane_b32 s3, v0, 5 +; CHECK-NEXT: v_readlane_b32 s4, v0, 6 +; CHECK-NEXT: v_readlane_b32 s5, v0, 7 +; CHECK-NEXT: v_readlane_b32 s6, 
v0, 8 +; CHECK-NEXT: v_readlane_b32 s7, v0, 9 +; CHECK-NEXT: v_readlane_b32 s8, v0, 10 +; CHECK-NEXT: v_readlane_b32 s9, v0, 11 +; CHECK-NEXT: v_readlane_b32 s10, v0, 12 +; CHECK-NEXT: v_readlane_b32 s11, v0, 13 +; CHECK-NEXT: v_readlane_b32 s12, v0, 14 +; CHECK-NEXT: v_readlane_b32 s13, v0, 15 +; CHECK-NEXT: v_readlane_b32 s14, v0, 16 +; CHECK-NEXT: v_readlane_b32 s15, v0, 17 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 22 -; CHECK-NEXT: v_readlane_b32 s1, v0, 23 +; CHECK-NEXT: v_readlane_b32 s0, v0, 18 +; CHECK-NEXT: v_readlane_b32 s1, v0, 19 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 20 +; CHECK-NEXT: v_readlane_b32 s1, v0, 21 +; CHECK-NEXT: v_readlane_b32 s2, v0, 22 +; CHECK-NEXT: v_readlane_b32 s3, v0, 23 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v0, 24 ; CHECK-NEXT: v_readlane_b32 s1, v0, 25 ; CHECK-NEXT: v_readlane_b32 s2, v0, 26 ; CHECK-NEXT: v_readlane_b32 s3, v0, 27 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 28 -; CHECK-NEXT: v_readlane_b32 s1, v0, 29 -; CHECK-NEXT: v_readlane_b32 s2, v0, 30 -; CHECK-NEXT: v_readlane_b32 s3, v0, 31 -; CHECK-NEXT: v_readlane_b32 s4, v0, 32 -; CHECK-NEXT: v_readlane_b32 s5, v0, 33 -; CHECK-NEXT: v_readlane_b32 s6, v0, 34 -; CHECK-NEXT: v_readlane_b32 s7, v0, 35 +; CHECK-NEXT: v_readlane_b32 s4, v0, 28 +; CHECK-NEXT: v_readlane_b32 s5, v0, 29 +; CHECK-NEXT: v_readlane_b32 s6, v0, 30 +; CHECK-NEXT: v_readlane_b32 s7, v0, 31 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 36 -; CHECK-NEXT: v_readlane_b32 s1, v0, 37 -; CHECK-NEXT: v_readlane_b32 s2, v0, 38 -; CHECK-NEXT: v_readlane_b32 s3, v0, 39 -; CHECK-NEXT: v_readlane_b32 s4, v0, 40 -; CHECK-NEXT: v_readlane_b32 s5, v0, 41 
-; CHECK-NEXT: v_readlane_b32 s6, v0, 42 -; CHECK-NEXT: v_readlane_b32 s7, v0, 43 -; CHECK-NEXT: v_readlane_b32 s8, v0, 44 -; CHECK-NEXT: v_readlane_b32 s9, v0, 45 -; CHECK-NEXT: v_readlane_b32 s10, v0, 46 -; CHECK-NEXT: v_readlane_b32 s11, v0, 47 -; CHECK-NEXT: v_readlane_b32 s12, v0, 48 -; CHECK-NEXT: v_readlane_b32 s13, v0, 49 -; CHECK-NEXT: v_readlane_b32 s14, v0, 50 -; CHECK-NEXT: v_readlane_b32 s15, v0, 51 +; CHECK-NEXT: v_readlane_b32 s0, v0, 32 +; CHECK-NEXT: v_readlane_b32 s1, v0, 33 +; CHECK-NEXT: v_readlane_b32 s2, v0, 34 +; CHECK-NEXT: v_readlane_b32 s3, v0, 35 +; CHECK-NEXT: v_readlane_b32 s4, v0, 36 +; CHECK-NEXT: v_readlane_b32 s5, v0, 37 +; CHECK-NEXT: v_readlane_b32 s6, v0, 38 +; CHECK-NEXT: v_readlane_b32 s7, v0, 39 +; CHECK-NEXT: v_readlane_b32 s8, v0, 40 +; CHECK-NEXT: v_readlane_b32 s9, v0, 41 +; CHECK-NEXT: v_readlane_b32 s10, v0, 42 +; CHECK-NEXT: v_readlane_b32 s11, v0, 43 +; CHECK-NEXT: v_readlane_b32 s12, v0, 44 +; CHECK-NEXT: v_readlane_b32 s13, v0, 45 +; CHECK-NEXT: v_readlane_b32 s14, v0, 46 +; CHECK-NEXT: v_readlane_b32 s15, v0, 47 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll index a51b0128a3a4a..59036c64c8afc 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 { ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 s0, s0, s13 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_load_dword s4, s[6:7], 0x2 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index f69b7ae105124..ebc916b5c889b 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -182,10 
+182,8 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_shl_i128_ss: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s5, s4, 64 ; GCN-NEXT: s_sub_i32 s12, 64, s4 @@ -205,7 +203,6 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -218,10 +215,8 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_lshr_i128_ss: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s5, s4, 64 ; GCN-NEXT: s_sub_i32 s12, 64, s4 @@ -241,7 +236,6 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -254,10 +248,8 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_ashr_i128_ss: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s5, 
64, s4 ; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 @@ -278,7 +270,6 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -439,9 +430,6 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 @@ -514,9 +502,6 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 @@ -589,9 +574,6 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index a38f0a6d86b8c..b872112922204 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -9,9 +9,6 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -23,9 +20,6 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -44,14 +38,11 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -61,14 +52,11 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 
0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -84,9 +72,6 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitcmp1_b32 s2, 0 ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -101,9 +86,6 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -122,9 +104,6 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; CI-LABEL: s_sint_to_fp_i64_to_f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; CI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -138,9 +117,6 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; VI-LABEL: s_sint_to_fp_i64_to_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -160,9 +136,6 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, 
ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 @@ -182,9 +155,6 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -213,9 +183,6 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 @@ -228,9 +195,6 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i32 s2, s2, 0x80000 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -268,14 +232,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -285,14 +246,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -325,14 +283,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -342,14 +297,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: 
s_cselect_b32 s2, 0xbff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -401,14 +353,11 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[6:7], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_add_i32 s10, s10, s15 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0, 0xbff00000 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -418,14 +367,11 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0, 0xbff00000 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index 0a6009c0e7da8..b4a981f1db4ec 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -12,10 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, 
implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %31.sub0 - ; GCN-NEXT: SI_SPILL_V64_SAVE %31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %30.sub0 + ; GCN-NEXT: SI_SPILL_V64_SAVE %30, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %23:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %22:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]] ; GCN-NEXT: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index b2e334c66ccd2..3644bef9c20a1 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -50,10 +50,7 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 { define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_add_i32 s10, s10, s15 ; HAWAII-NEXT: s_or_b32 s0, s6, 14 -; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s11 -; HAWAII-NEXT: s_lshr_b32 
flat_scratch_hi, s10, 8 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s7 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] @@ -73,10 +70,7 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_add_i32 s10, s10, s15 ; FIJI-NEXT: s_or_b32 s0, s6, 14 -; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s7 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll index 30accc846d2b6..19d633651fdd0 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 { ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ -; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!....... +; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @............... ; ELF: AMDGPU Metadata ; ELF: .sgpr_count: 9 diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll index 4f84b31f1877b..2097579e0c995 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 { ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ -; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!....... +; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000 ................ 
; ELF: AMDGPU Metadata ; ELF: .sgpr_count: 5 diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll index 0b1bd11b88d5d..775c62e73261a 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack < %s | FileCheck --check-prefixes=ASM %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s @@ -7,17 +6,15 @@ define amdgpu_kernel void @kern() #0 { ; ASM-LABEL: kern: -; ASM: ; %bb.0: ; %entry -; ASM-NEXT: ;;#ASMSTART -; ASM-NEXT: ;;#ASMEND -; ASM-NEXT: s_endpgm +; ASM: .amdhsa_next_free_sgpr 5 +; ASM: .amdhsa_reserve_xnack_mask 1 ; Verify that an extra SGPR block is reserved with XNACK "on" tid setting. ; OBJ: Contents of section .rodata: ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ -; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!....... +; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @............... 
; ELF: AMDGPU Metadata ; ELF: .sgpr_count: 9 diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 2516177691ce3..4dfd4c095c87a 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -23,14 +23,11 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; HSA-TRAP-GFX803-LABEL: trap: ; HSA-TRAP-GFX803: ; %bb.0: ; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 -; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 -; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 -; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s2 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s3 -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) ; HSA-TRAP-GFX803-NEXT: s_trap 2 @@ -124,9 +121,6 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX803-LABEL: non_entry_trap: ; HSA-TRAP-GFX803: ; %bb.0: ; %entry ; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 -; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 -; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -288,9 +282,6 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; HSA-TRAP-GFX803: ; %bb.0: ; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 -; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 -; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 -; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5 @@ -423,13 +414,10 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; HSA-TRAP-GFX803-LABEL: debugtrap: ; HSA-TRAP-GFX803: ; %bb.0: ; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 -; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 -; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 ; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 317e350f3eafe..735956caa72da 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -81,9 +81,6 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-LABEL: udiv_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -255,9 +252,6 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GCN-LABEL: s_udiv_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_sub_i32 s4, 0, s3 @@ -463,9 +457,6 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, 
ptr addrspace(1) %i ; GCN-LABEL: udiv_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -819,9 +810,6 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: udiv_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 16 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1147,9 +1135,6 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; GCN-LABEL: udiv_i32_div_pow2: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1239,9 +1224,6 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; GCN-LABEL: udiv_i32_div_k_even: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1336,9 +1318,6 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; GCN-LABEL: udiv_i32_div_k_odd: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 
s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1451,9 +1430,6 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-LABEL: v_udiv_i8: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1594,9 +1570,6 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: v_udiv_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1753,9 +1726,6 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: v_udiv_i23: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1953,9 +1923,6 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: v_udiv_i24: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -2138,9 +2105,6 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; GCN-LABEL: scalarize_mulhu_4xi32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; 
GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -2254,9 +2218,6 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; GCN-LABEL: test_udiv2: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -2320,9 +2281,6 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0 @@ -2413,9 +2371,6 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; ; GCN-LABEL: fdiv_test_denormals: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: flat_load_sbyte v2, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index 44b16d7f65dc5..d00ea6dff2447 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -9,9 +9,6 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NEXT: s_add_i32 s10, s10, s15 -; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 
@@ -31,9 +28,6 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -60,9 +54,6 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; SI-LABEL: s_uint_to_fp_i64_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_add_i32 s10, s10, s15 -; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -76,9 +67,6 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; VI-LABEL: s_uint_to_fp_i64_to_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -98,9 +86,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 ; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; SI-NEXT: s_add_i32 s10, s10, s15 -; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 @@ -118,9 +103,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 ; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: 
s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 @@ -144,9 +126,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) { ; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_add_i32 s10, s10, s15 -; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -179,9 +158,6 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; ; VI-LABEL: s_uint_to_fp_v4i64_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -220,9 +196,6 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_add_i32 s10, s10, s15 -; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; SI-NEXT: v_mov_b32_e32 v3, s1 @@ -234,9 +207,6 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -252,9 +222,6 @@ define 
amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 ; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -272,9 +239,6 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 ; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; SI-NEXT: s_add_i32 s10, s10, s15 -; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -295,9 +259,6 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -325,14 +286,11 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_add_i32 s10, s10, s15 -; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -342,14 +300,11 @@ define amdgpu_kernel 
void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -365,9 +320,6 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_add_i32 s10, s10, s15 -; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s2, 0 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -382,9 +334,6 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -404,9 +353,6 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_add_i32 s10, s10, s15 -; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xff ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -419,9 +365,6 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; VI: ; 
%bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -459,14 +402,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_add_i32 s10, s10, s15 -; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -476,14 +416,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -516,14 +453,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_add_i32 s10, s10, s15 -; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) 
; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -533,14 +467,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -574,14 +505,11 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[6:7], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_add_i32 s10, s10, s15 -; SI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; SI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -591,14 +519,11 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_add_i32 s10, s10, s15 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s11 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000 ; 
VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll index 43a948b6c6ab2..a827ebe96cfcf 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-LABEL: __omp_offloading_16_dd2df_main_l9: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s13 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll index 33f629a3c4f0c..4545c8bbeb3e6 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll @@ -25,9 +25,8 @@ ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } -; CHECK-NEXT: workGroupIDX: { reg: '$sgpr8' } -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' } +; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } ; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 @@ -41,7 +40,7 @@ ; CHECK-NEXT: BitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 8 ; CHECK-NEXT: vgprForAGPRCopy: '' -; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: 
'$sgpr2_sgpr3' ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: body: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index 1fbd3760eed26..8215ba834170f 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -25,9 +25,8 @@ ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } -; CHECK-NEXT: workGroupIDX: { reg: '$sgpr8' } -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' } +; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } ; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 @@ -41,7 +40,7 @@ ; CHECK-NEXT: BitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 8 ; CHECK-NEXT: vgprForAGPRCopy: '' -; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' ; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: body: From 3e84d162d2247846e9253758e2573ca8998adf7f Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Tue, 3 Sep 2024 16:25:45 -0700 Subject: [PATCH 05/13] Fix tests after merge from main branch --- clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu | 2 +- .../AMDGPU/attributor-flatscratchinit-globalisel.ll | 8 ++++---- llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll | 8 ++++---- llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll | 2 +- llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll | 2 +- llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll | 3 ++- llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll | 2 +- 7 files changed, 14 insertions(+), 13 deletions(-) diff --git 
a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu index b295bbbdaaf95..7f016180629e4 100644 --- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu +++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu @@ -432,7 +432,7 @@ __global__ void kernel4(struct S s) { // CHECK-SPIRV-NEXT: ret void // // OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel5P1S( -// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { // OPT-NEXT: [[ENTRY:.*:]] // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(1) [[S_COERCE]], align 8 // OPT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll index b7503f26b1ab6..9efdf3cbb8606 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll @@ -4,7 +4,7 @@ ;; tests of alloca define void @without_alloca(i1 %arg0) { - store volatile i1 %arg0, ptr addrspace(1) undef + store volatile i1 %arg0, ptr addrspace(1) null ret void } @@ -15,7 +15,7 @@ define void @with_alloca() { } define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) { - store volatile i1 %arg0, ptr addrspace(1) undef + store volatile i1 %arg0, ptr addrspace(1) null ret void } @@ -453,13 +453,13 @@ declare i32 @llvm.amdgcn.workgroup.id.x() define void @use_intrinsic_workitem_id_x() { %val = call i32 @llvm.amdgcn.workitem.id.x() - store volatile i32 %val, ptr addrspace(1) undef + store volatile i32 %val, ptr addrspace(1) null ret void } define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() { %val = call i32 @llvm.amdgcn.workitem.id.x() - store volatile i32 %val, ptr addrspace(1) undef 
+ store volatile i32 %val, ptr addrspace(1) null ret void } diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll index f04c93961b670..d32ef070b983e 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll @@ -10,7 +10,7 @@ define void @without_alloca(i1 %arg0) { ; ; GFX10-LABEL: define void @without_alloca(i1 %arg0) ; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] - store volatile i1 %arg0, ptr addrspace(1) undef + store volatile i1 %arg0, ptr addrspace(1) null ret void } @@ -31,7 +31,7 @@ define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) { ; ; GFX10-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) ; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] - store volatile i1 %arg0, ptr addrspace(1) undef + store volatile i1 %arg0, ptr addrspace(1) null ret void } @@ -858,7 +858,7 @@ define void @use_intrinsic_workitem_id_x() { ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x() ; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] %val = call i32 @llvm.amdgcn.workitem.id.x() - store volatile i32 %val, ptr addrspace(1) undef + store volatile i32 %val, ptr addrspace(1) null ret void } @@ -869,7 +869,7 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() ; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] %val = call i32 @llvm.amdgcn.workitem.id.x() - store volatile i32 %val, ptr addrspace(1) undef + store volatile i32 %val, ptr addrspace(1) null ret void } diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll index ebca990699878..55113a3b6f263 100644 --- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -36,5 +36,5 @@ define amdgpu_kernel void @test_direct_indirect_call() { } ;. 
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index 074dba1cbcc93..df6a75c7376df 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -43,5 +43,5 @@ attributes #0 = { "amdgpu-no-dispatch-id" } ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" } ;. 
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll index 8562cdf195b02..4e3fc24c4d13a 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll @@ -101,7 +101,8 @@ entry: ;. 
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } +; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } +; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. ; NO: [[META0]] = !{ptr @bar1, ptr @bar2} ;. diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index 183cdb2e1f862..b5f5e3cf1a1a7 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -81,7 +81,7 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" } ;. 
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. ; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. From bf274dc4ee02e6c24bd2855fd11d9e79970c3137 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 4 Sep 2024 14:20:24 -0700 Subject: [PATCH 06/13] Undo changes made to AMDGPUSubtarget.cpp to limit impact on tests. This undo is simply achieved by merging code from upstream because a recent commit has changed that file. The changes therein will be included in a separate PR. 
--- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 4 +- .../annotate-kernel-features-hsa-call.ll | 89 ++--- .../AMDGPU/annotate-kernel-features-hsa.ll | 34 +- .../AMDGPU/attributor-flatscratchinit.ll | 372 +++++++++--------- .../CodeGen/AMDGPU/direct-indirect-call.ll | 2 +- .../AMDGPU/duplicate-attribute-indirect.ll | 2 +- .../CodeGen/AMDGPU/simple-indirect-call-2.ll | 2 +- 7 files changed, 251 insertions(+), 254 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 03391a36c54c7..ddbbb73dd84e1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -441,7 +441,9 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { } for (Instruction &I : instructions(F)) { - if (isa(I) || isa(I)) { + if (isa(I) && + cast(I).getSrcAddressSpace() == + AMDGPUAS::PRIVATE_ADDRESS) { removeAssumedBits(FLAT_SCRATCH_INIT); return; } diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index ab2e28e5f5cbb..ed136c58379cb 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -530,7 +530,7 @@ define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9 -; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null) ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -545,7 +545,7 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9 -; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] { ; 
ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null) ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -593,7 +593,7 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr -; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8 ; ATTRIBUTOR_HSA-NEXT: ret void @@ -611,7 +611,7 @@ define void @use_implicitarg_ptr() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr -; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8 ; ATTRIBUTOR_HSA-NEXT: ret void @@ -628,7 +628,7 @@ define void @func_indirect_use_implicitarg_ptr() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr -; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_implicitarg_ptr() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -645,7 +645,7 @@ define internal void @defined.func() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func -; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void ; ret void @@ -658,7 +658,7 @@ define void @func_call_external() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external -; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] { ; 
ATTRIBUTOR_HSA-NEXT: call void @external.func() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -673,7 +673,7 @@ define void @func_call_defined() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_defined -; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] { ; ATTRIBUTOR_HSA-NEXT: call void @defined.func() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -687,8 +687,8 @@ define void @func_call_asm() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm -; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] { -; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR31:[0-9]+]] +; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] { +; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR28:[0-9]+]] ; ATTRIBUTOR_HSA-NEXT: ret void ; call void asm sideeffect "", ""() #3 @@ -702,7 +702,7 @@ define amdgpu_kernel void @kern_call_external() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external -; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @external.func() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -717,7 +717,7 @@ define amdgpu_kernel void @func_kern_defined() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_kern_defined -; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] { ; ATTRIBUTOR_HSA-NEXT: call void @defined.func() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -751,7 +751,7 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func -; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @use_dispatch_ptr_ret_type() ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ 
-769,7 +769,7 @@ define float @func_indirect_call(ptr %fptr) #3 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call -; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] { +; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]() ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ -788,7 +788,7 @@ define float @func_extern_call() #3 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call -; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @extern() ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ -806,7 +806,7 @@ define float @func_null_call(ptr %fptr) #3 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call -; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] { +; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float null() ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ -827,7 +827,7 @@ define float @func_other_intrinsic_call(float %arg) #3 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call -; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR20]] { +; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR17]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]]) ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ -845,7 +845,7 @@ define amdgpu_kernel void @kern_sanitize_address() #4 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] { +; 
ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -861,7 +861,7 @@ define void @func_sanitize_address() #4 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -877,7 +877,7 @@ define void @func_indirect_sanitize_address() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -893,7 +893,7 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -912,7 +912,7 @@ define amdgpu_kernel void @kern_decl_sanitize_address() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR21]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] { ; ATTRIBUTOR_HSA-NEXT: call void @extern_func_sanitize_address() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -928,7 +928,7 @@ define internal void @enqueue_block_def() #6 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def -; ATTRIBUTOR_HSA-SAME: () #[[ATTR28:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void ; ret void @@ -941,7 +941,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() { ; AKF_HSA-NEXT: ret void ; 
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl -; ATTRIBUTOR_HSA-SAME: () #[[ATTR29:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR26:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_decl() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -995,7 +995,7 @@ define amdgpu_kernel void @kern_callsite_enqueue_block() { ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block ; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] { -; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR32:[0-9]+]] +; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR29:[0-9]+]] ; ATTRIBUTOR_HSA-NEXT: ret void ; call void @known_func() #6 @@ -1037,27 +1037,24 @@ attributes #6 = { "enqueued-block" } ; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" 
"amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR26:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR27:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR30]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR31]] = { nounwind } -; ATTRIBUTOR_HSA: attributes #[[ATTR32]] = { "enqueued-block" } +; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { 
nounwind "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR23:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR24:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR27:[0-9]+]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { nounwind } +; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "enqueued-block" } ;. ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index ac5458f56f08b..bab5b5c31714c 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -442,9 +442,8 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR12]] { -; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { +; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(5) [[PTR]], align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(5) %ptr to ptr @@ -454,7 +453,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 { ; HSA-LABEL: define {{[^@]+}}@use_flat_to_group_addrspacecast -; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { +; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { ; HSA-NEXT: [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) ; HSA-NEXT: store volatile i32 0, ptr addrspace(3) [[FTOS]], align 4 ; HSA-NEXT: ret void @@ -466,7 +465,7 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 { define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) #1 { ; HSA-LABEL: define {{[^@]+}}@use_flat_to_private_addrspacecast -; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] { +; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { ; HSA-NEXT: [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) ; HSA-NEXT: store volatile i32 0, ptr addrspace(5) [[FTOS]], align 4 ; HSA-NEXT: ret void @@ -485,7 +484,7 @@ define amdgpu_kernel void 
@use_global_to_flat_addrspacecast(ptr addrspace(1) %pt ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR13]] { +; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) [[PTR]], align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -502,7 +501,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) % ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR13]] { +; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[PTR]], align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -513,7 +512,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) % define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 { ; HSA-LABEL: define {{[^@]+}}@use_flat_to_global_addrspacecast -; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] { +; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { ; HSA-NEXT: [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) ; HSA-NEXT: store volatile i32 0, ptr addrspace(1) [[FTOS]], align 4 ; HSA-NEXT: ret void @@ -525,7 +524,7 @@ define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 { define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #1 { ; HSA-LABEL: define {{[^@]+}}@use_flat_to_constant_addrspacecast -; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] { +; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { ; HSA-NEXT: [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(4) ; HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[FTOS]], align 4 ; HSA-NEXT: ret void @@ -544,7 +543,7 @@ define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define 
{{[^@]+}}@use_is_shared -; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) ; ATTRIBUTOR_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32 ; ATTRIBUTOR_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) undef, align 4 @@ -565,7 +564,7 @@ define amdgpu_kernel void @use_is_private(ptr %ptr) #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private -; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14]] { +; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) ; ATTRIBUTOR_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32 ; ATTRIBUTOR_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) undef, align 4 @@ -585,7 +584,7 @@ define amdgpu_kernel void @use_alloca() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca -; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) ; ATTRIBUTOR_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 ; ATTRIBUTOR_HSA-NEXT: ret void @@ -606,7 +605,7 @@ define amdgpu_kernel void @use_alloca_non_entry_block() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block -; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: entry: ; ATTRIBUTOR_HSA-NEXT: br label [[BB:%.*]] ; ATTRIBUTOR_HSA: bb: @@ -631,7 +630,7 @@ define void @use_alloca_func() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func -; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR14:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) ; ATTRIBUTOR_HSA-NEXT: store i32 0, ptr addrspace(5) 
[[ALLOCA]], align 4 ; ATTRIBUTOR_HSA-NEXT: ret void @@ -664,10 +663,9 @@ attributes #1 = { nounwind } ; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll index d32ef070b983e..b33ec280b034c 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll @@ -6,20 +6,20 @@ define void @without_alloca(i1 %arg0) { ; GFX9-LABEL: define void @without_alloca(i1 %arg0) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI:[0-9]+]] ; ; GFX10-LABEL: define void @without_alloca(i1 %arg0) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI:[0-9]+]] store volatile i1 %arg0, ptr addrspace(1) null ret void } define void @with_alloca() { ; GFX9-LABEL: define void @with_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @with_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 ret void @@ -27,20 +27,20 @@ define void @with_alloca() { define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) { ; GFX9-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2:[0-9]+]] store volatile i1 %arg0, ptr addrspace(1) null ret void } define amdgpu_kernel void @with_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) 
%temp, align 4 ret void @@ -50,10 +50,10 @@ define amdgpu_kernel void @with_alloca_cc_kernel() { define amdgpu_vs void @with_alloca_cc_vs() { ; GFX9-LABEL: define amdgpu_vs void @with_alloca_cc_vs() -; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_vs void @with_alloca_cc_vs() -; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] +; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS:[0-9]+]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 ret void @@ -61,10 +61,10 @@ define amdgpu_vs void @with_alloca_cc_vs() { define amdgpu_gs void @with_alloca_cc_gs() { ; GFX9-LABEL: define amdgpu_gs void @with_alloca_cc_gs() -; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS]] ; ; GFX10-LABEL: define amdgpu_gs void @with_alloca_cc_gs() -; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] +; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 ret void @@ -72,10 +72,10 @@ define amdgpu_gs void @with_alloca_cc_gs() { define amdgpu_ps void @with_alloca_cc_ps() { ; GFX9-LABEL: define amdgpu_ps void @with_alloca_cc_ps() -; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS]] ; ; GFX10-LABEL: define amdgpu_ps void @with_alloca_cc_ps() -; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] +; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 ret void @@ -83,10 +83,10 @@ define amdgpu_ps void @with_alloca_cc_ps() { define amdgpu_cs void @with_alloca_cc_cs() { ; GFX9-LABEL: define amdgpu_cs void @with_alloca_cc_cs() -; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS]] ; ; GFX10-LABEL: define amdgpu_cs void @with_alloca_cc_cs() -; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] +; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS]] %temp = alloca i32, 
addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 ret void @@ -94,10 +94,10 @@ define amdgpu_cs void @with_alloca_cc_cs() { define amdgpu_hs void @with_alloca_cc_hs() { ; GFX9-LABEL: define amdgpu_hs void @with_alloca_cc_hs() -; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS]] ; ; GFX10-LABEL: define amdgpu_hs void @with_alloca_cc_hs() -; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] +; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 ret void @@ -105,10 +105,10 @@ define amdgpu_hs void @with_alloca_cc_hs() { define amdgpu_ls void @with_alloca_cc_ls() { ; GFX9-LABEL: define amdgpu_ls void @with_alloca_cc_ls() -; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS]] ; ; GFX10-LABEL: define amdgpu_ls void @with_alloca_cc_ls() -; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] +; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 ret void @@ -116,10 +116,10 @@ define amdgpu_ls void @with_alloca_cc_ls() { define amdgpu_es void @with_alloca_cc_es() { ; GFX9-LABEL: define amdgpu_es void @with_alloca_cc_es() -; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]] +; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS]] ; ; GFX10-LABEL: define amdgpu_es void @with_alloca_cc_es() -; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]] +; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 ret void @@ -127,10 +127,10 @@ define amdgpu_es void @with_alloca_cc_es() { define amdgpu_gfx void @with_alloca_cc_gfx() { ; GFX9-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx() -; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]] +; GFX9-SAME: #[[ATTR3_GFX9_CC_GRAPHICS2:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx() -; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]] +; 
GFX10-SAME: #[[ATTR3_GFX10_CC_GRAPHICS2:[0-9]+]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 ret void @@ -138,10 +138,10 @@ define amdgpu_gfx void @with_alloca_cc_gfx() { define amdgpu_cs_chain void @with_alloca_cc_cs_chain() { ; GFX9-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain() -; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]] +; GFX9-SAME: #[[ATTR3_GFX9_CC_GRAPHICS2]] ; ; GFX10-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain() -; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]] +; GFX10-SAME: #[[ATTR3_GFX10_CC_GRAPHICS2]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 ret void @@ -149,10 +149,10 @@ define amdgpu_cs_chain void @with_alloca_cc_cs_chain() { define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() { ; GFX9-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() -; GFX9-SAME: #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]] +; GFX9-SAME: #[[ATTR3_GFX9_CC_GRAPHICS2]] ; ; GFX10-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() -; GFX10-SAME: #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]] +; GFX10-SAME: #[[ATTR3_GFX10_CC_GRAPHICS2]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 ret void @@ -160,40 +160,40 @@ define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() { define void @call_without_alloca() { ; GFX9-LABEL: define void @call_without_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @call_without_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] call void @without_alloca(i1 true) ret void } define amdgpu_kernel void @call_without_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void 
@call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] call void @without_alloca(i1 true) ret void } define void @call_with_alloca() { ; GFX9-LABEL: define void @call_with_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @call_with_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] call void @with_alloca() ret void } define amdgpu_kernel void @call_with_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] call void @with_alloca() ret void } @@ -222,50 +222,50 @@ define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() { define void @call_call_without_alloca() { ; GFX9-LABEL: define void @call_call_without_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @call_call_without_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] call void @call_without_alloca() ret void } define amdgpu_kernel void @call_call_without_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] call void @call_without_alloca() ret void } define void @call_call_with_alloca() { ; GFX9-LABEL: define void @call_call_with_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @call_call_with_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: 
#[[ATTR0_GFX10_NOFSI]] call void @call_with_alloca() ret void } define amdgpu_kernel void @call_call_with_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] call void @call_with_alloca() ret void } define void @with_alloca_call_without_alloca() { ; GFX9-LABEL: define void @with_alloca_call_without_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @with_alloca_call_without_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @without_alloca() @@ -274,10 +274,10 @@ define void @with_alloca_call_without_alloca() { define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @without_alloca() @@ -286,10 +286,10 @@ define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() { define void @with_alloca_call_with_alloca() { ; GFX9-LABEL: define void @with_alloca_call_with_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @with_alloca_call_with_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, 
align 4 call void @with_alloca() @@ -298,10 +298,10 @@ define void @with_alloca_call_with_alloca() { define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @with_alloca() @@ -310,10 +310,10 @@ define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() { define void @with_alloca_call_call_without_alloca() { ; GFX9-LABEL: define void @with_alloca_call_call_without_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @with_alloca_call_call_without_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @call_without_alloca() @@ -322,10 +322,10 @@ define void @with_alloca_call_call_without_alloca() { define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @call_without_alloca() @@ -334,10 +334,10 @@ define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() { define void @with_alloca_call_call_with_alloca() { ; GFX9-LABEL: define void @with_alloca_call_call_with_alloca() -; GFX9-SAME: 
#[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @with_alloca_call_call_with_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @call_with_alloca() @@ -346,10 +346,10 @@ define void @with_alloca_call_call_with_alloca() { define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] %temp = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 4 call void @call_with_alloca() @@ -360,30 +360,30 @@ define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() { define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { ; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] store volatile i32 0, ptr addrspace(1) %ptr ret void } define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] store volatile i32 0, ptr addrspace(1) %ptr ret void } 
define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { ; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] %stof = addrspacecast ptr addrspace(1) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -391,10 +391,10 @@ define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] %stof = addrspacecast ptr addrspace(1) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -402,30 +402,30 @@ define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrs define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { ; GFX9-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] store volatile i32 0, ptr addrspace(2) %ptr ret void } define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: 
define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] store volatile i32 0, ptr addrspace(2) %ptr ret void } define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { ; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] %stof = addrspacecast ptr addrspace(2) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -433,10 +433,10 @@ define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] %stof = addrspacecast ptr addrspace(2) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -444,30 +444,30 @@ define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrs define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] store volatile i32 0, ptr addrspace(3) %ptr ret void } define amdgpu_kernel void 
@without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] store volatile i32 0, ptr addrspace(3) %ptr ret void } define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] %stof = addrspacecast ptr addrspace(3) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -475,10 +475,10 @@ define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] %stof = addrspacecast ptr addrspace(3) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -486,30 +486,30 @@ define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrsp define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { ; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: 
define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] store volatile i32 0, ptr addrspace(4) %ptr ret void } define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] store volatile i32 0, ptr addrspace(4) %ptr ret void } define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { ; GFX9-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] %stof = addrspacecast ptr addrspace(4) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -517,10 +517,10 @@ define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] %stof = addrspacecast ptr addrspace(4) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -528,30 +528,30 @@ define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr 
add define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] store volatile i32 0, ptr addrspace(5) %ptr ret void } define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] store volatile i32 0, ptr addrspace(5) %ptr ret void } define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI:[0-9]+]] ; ; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI:[0-9]+]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -559,10 +559,10 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) 
%ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2:[0-9]+]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -570,50 +570,50 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define 
amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -621,10 +621,10 @@ define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrsp define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -632,70 +632,70 @@ define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacec define void 
@call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void 
@call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] %stof = addrspacecast ptr addrspace(5) %ptr 
to ptr store volatile i32 0, ptr %stof call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -704,10 +704,10 @@ define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace( define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -716,10 +716,10 @@ define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -728,10 +728,10 @@ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: 
#[[ATTR5_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -742,10 +742,10 @@ define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_ define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] ; ; GFX10-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] call void @without_alloca(i1 true) call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -753,10 +753,10 @@ define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] call void @without_alloca(i1 true) call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -764,10 +764,10 @@ define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kern define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void 
@call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] call void @without_alloca(i1 true) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -775,10 +775,10 @@ define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) { define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] call void @without_alloca(i1 true) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -790,10 +790,10 @@ define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel( define void @with_indirect_call() { ; GFX9-LABEL: define void @with_indirect_call() -; GFX9-SAME: #[[ATTR_GFX9_IND_CALL:[0-9]+]] +; GFX9-SAME: #[[ATTR6_GFX9_IND_CALL:[0-9]+]] ; ; GFX10-LABEL: define void @with_indirect_call() -; GFX10-SAME: #[[ATTR_GFX10_IND_CALL:[0-9]+]] { +; GFX10-SAME: #[[ATTR6_GFX10_IND_CALL:[0-9]+]] { %fptr = load ptr, ptr addrspace(4) @gv.fptr0 call void %fptr() ret void @@ -801,10 +801,10 @@ define void @with_indirect_call() { define amdgpu_kernel void @with_indirect_call_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_IND_CALL2:[0-9]+]] +; GFX9-SAME: #[[ATTR7_GFX9_IND_CALL2:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void 
@with_indirect_call_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_IND_CALL2:[0-9]+]] +; GFX10-SAME: #[[ATTR7_GFX10_IND_CALL2:[0-9]+]] %fptr = load ptr, ptr addrspace(4) @gv.fptr0 call void %fptr() ret void @@ -812,20 +812,20 @@ define amdgpu_kernel void @with_indirect_call_cc_kernel() { define void @call_with_indirect_call() { ; GFX9-LABEL: define void @call_with_indirect_call() -; GFX9-SAME: #[[ATTR_GFX9_IND_CALL:[0-9]+]] +; GFX9-SAME: #[[ATTR6_GFX9_IND_CALL]] ; ; GFX10-LABEL: define void @call_with_indirect_call() -; GFX10-SAME: #[[ATTR_GFX10_IND_CALL:[0-9]+]] +; GFX10-SAME: #[[ATTR6_GFX10_IND_CALL]] call void @with_indirect_call() ret void } define amdgpu_kernel void @call_with_indirect_call_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_IND_CALL2:[0-9]+]] +; GFX9-SAME: #[[ATTR7_GFX9_IND_CALL2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_IND_CALL2:[0-9]+]] +; GFX10-SAME: #[[ATTR7_GFX10_IND_CALL2]] call void @with_indirect_call() ret void } @@ -840,10 +840,10 @@ define void @also_empty() { define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) { ; GFX9-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) -; GFX9-SAME: #[[ATTR_GFX9_NOFSI3:[0-9]+]] +; GFX9-SAME: #[[ATTR8_GFX9_NOFSI3:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) -; GFX10-SAME: #[[ATTR_GFX10_NOFSI3:[0-9]+]] +; GFX10-SAME: #[[ATTR8_GFX10_NOFSI3:[0-9]+]] %fptr = select i1 %cond, ptr @empty, ptr @also_empty call void %fptr() ret void @@ -853,10 +853,10 @@ declare i32 @llvm.amdgcn.workgroup.id.x() define void @use_intrinsic_workitem_id_x() { ; GFX9-LABEL: define void @use_intrinsic_workitem_id_x() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI4:[0-9]+]] +; GFX9-SAME: #[[ATTR10_GFX9_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI4:[0-9]+]] +; GFX10-SAME: 
#[[ATTR10_GFX10_NOFSI4:[0-9]+]] %val = call i32 @llvm.amdgcn.workitem.id.x() store volatile i32 %val, ptr addrspace(1) null ret void @@ -864,10 +864,10 @@ define void @use_intrinsic_workitem_id_x() { define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] %val = call i32 @llvm.amdgcn.workitem.id.x() store volatile i32 %val, ptr addrspace(1) null ret void @@ -875,30 +875,30 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() { define void @call_use_intrinsic_workitem_id_x() { ; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI4]] +; GFX9-SAME: #[[ATTR10_GFX9_NOFSI4]] ; ; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI4]] +; GFX10-SAME: #[[ATTR10_GFX10_NOFSI4]] call void @use_intrinsic_workitem_id_x() ret void } define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NOFSI5:[0-9]+]] +; GFX9-SAME: #[[ATTR11_GFX9_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NOFSI5:[0-9]+]] +; GFX10-SAME: #[[ATTR11_GFX10_NOFSI5:[0-9]+]] call void @use_intrinsic_workitem_id_x() ret void } define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) -; 
GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) store volatile i32 7, ptr %1, align 4 ret void @@ -906,65 +906,65 @@ define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) ret void } define amdgpu_kernel void @with_inline_asm() { ; GFX9-LABEL: with_inline_asm -; GFX9-SAME: #[[ATTR_GFX9_NOFSI3]] +; GFX9-SAME: #[[ATTR8_GFX9_NOFSI3]] ; ; GFX10-LABEL: with_inline_asm -; GFX10-SAME: #[[ATTR_GFX10_NOFSI3]] +; GFX10-SAME: #[[ATTR8_GFX10_NOFSI3]] call void asm sideeffect "; use $0", "a"(i32 poison) ret void } -; GFX9: attributes #[[ATTR_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR0_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR1_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR2_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR3_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR4_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR5_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR6_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR7_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR8_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NOFSI4]] = { "amdgpu-no-agpr" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR10_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR11_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR0_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes 
#[[ATTR1_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR2_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR3_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes 
#[[ATTR4_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR5_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR6_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR7_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" 
"amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR8_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR10_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR11_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll index 55113a3b6f263..c108b93766bcc 100644 --- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -36,5 +36,5 @@ define amdgpu_kernel void @test_direct_indirect_call() { } ;. 
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index df6a75c7376df..908746a0c7784 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -43,5 +43,5 @@ attributes #0 = { "amdgpu-no-dispatch-id" } ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll index 4e3fc24c4d13a..5326c9e2c12f0 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll @@ -102,7 +102,7 @@ entry: ;. ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } -; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" 
} ;. ; NO: [[META0]] = !{ptr @bar1, ptr @bar2} ;. From 703eecc9184857e80125b36e1c77f13ca78b55ba Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 4 Sep 2024 16:50:38 -0700 Subject: [PATCH 07/13] Code improvement in the lambda. --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index ddbbb73dd84e1..9a7752b0f63bb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -707,15 +707,16 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { const auto &CB = cast(I); const Function *Callee = CB.getCalledFunction(); - if (Callee && Callee->isIntrinsic()) + // Callee == 0 for inline asm or indirect call with known callees. + // In the latter case, updateImpl() already checked the callees and we + // know their FLAT_SCRATCH_INIT bit is set. + // If function has indirect call with unknown callees, the bit is + // already removed in updateImpl() and execution won't reach here. + if (!Callee) + return true; + else return Callee->getIntrinsicID() != Intrinsic::amdgcn_addrspacecast_nonnull; - - // Return true for all other cases, including (1)inline asm, (2)direct - // call, and (3)indirect call with known callees. For (2) and (3) - // updateImpl() already checked the callees and we know their - // FLAT_SCRATCH_INIT bit is set. - return true; }; bool UsedAssumedInformation = false; From a5972c657950ec7b62cfbc974dce8764f773abc2 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 18 Sep 2024 15:55:50 -0700 Subject: [PATCH 08/13] (1) check constant for addrspacecast (2) remove alloca related tests. 
--- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 41 +- .../attributor-flatscratchinit-globalisel.ll | 443 -------------- .../AMDGPU/attributor-flatscratchinit.ll | 556 +++--------------- 3 files changed, 133 insertions(+), 907 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 9a7752b0f63bb..6f64c1cdb8094 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -440,13 +440,24 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { return; } + SmallPtrSet VisitedConsts; + for (Instruction &I : instructions(F)) { if (isa(I) && - cast(I).getSrcAddressSpace() == + cast(I).getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { removeAssumedBits(FLAT_SCRATCH_INIT); return; } + // check for addrSpaceCast in constant expressions + for (const Use &U : I.operands()) { + if (const auto *C = dyn_cast(U)) { + if (constHasASCast(C, VisitedConsts)) { + removeAssumedBits(FLAT_SCRATCH_INIT); + return; + } + } + } } } @@ -714,9 +725,9 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { // already removed in updateImpl() and execution won't reach here. 
if (!Callee) return true; - else - return Callee->getIntrinsicID() != - Intrinsic::amdgcn_addrspacecast_nonnull; + + return Callee->getIntrinsicID() != + Intrinsic::amdgcn_addrspacecast_nonnull; }; bool UsedAssumedInformation = false; @@ -726,6 +737,28 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this, UsedAssumedInformation); } + + bool constHasASCast(const Constant *C, + SmallPtrSetImpl &Visited) { + if (!Visited.insert(C).second) + return false; + + if (const auto *CE = dyn_cast(C)) + if (CE->getOpcode() == Instruction::AddrSpaceCast && + CE->getOperand(0)->getType()->getPointerAddressSpace() == + AMDGPUAS::PRIVATE_ADDRESS) + return true; + + for (const Use &U : C->operands()) { + const auto *OpC = dyn_cast(U); + if (!OpC || !Visited.insert(OpC).second) + continue; + + if (constHasASCast(OpC, Visited)) + return true; + } + return false; + } }; AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP, diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll index 9efdf3cbb8606..9ae5782298a28 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll @@ -1,200 +1,6 @@ ; Test the generation of the attribute amdgpu-no-flat-scratch-init ; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator | FileCheck -check-prefixes=GFX10 %s -;; tests of alloca - -define void @without_alloca(i1 %arg0) { - store volatile i1 %arg0, ptr addrspace(1) null - ret void -} - -define void @with_alloca() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) { - store volatile i1 %arg0, ptr addrspace(1) 
null - ret void -} - -define amdgpu_kernel void @with_alloca_cc_kernel() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -; graphics functions won't get the attribute amdgpu-no-flat-scratch-init - -define amdgpu_vs void @with_alloca_cc_vs() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_gs void @with_alloca_cc_gs() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_ps void @with_alloca_cc_ps() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_cs void @with_alloca_cc_cs() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_hs void @with_alloca_cc_hs() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_ls void @with_alloca_cc_ls() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_es void @with_alloca_cc_es() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_gfx void @with_alloca_cc_gfx() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_cs_chain void @with_alloca_cc_cs_chain() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define void @call_without_alloca() { - call void @without_alloca(i1 true) - ret void -} - -define amdgpu_kernel void @call_without_alloca_cc_kernel() { - call void 
@without_alloca(i1 true) - ret void -} - -define void @call_with_alloca() { - call void @with_alloca() - ret void -} - -define amdgpu_kernel void @call_with_alloca_cc_kernel() { - call void @with_alloca() - ret void -} - -define void @call_both_with_and_without_alloca() { - call void @with_alloca() - call void @without_alloca() - ret void -} - -define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() { - call void @with_alloca() - call void @without_alloca() - ret void -} - -define void @call_call_without_alloca() { - call void @call_without_alloca() - ret void -} - -define amdgpu_kernel void @call_call_without_alloca_cc_kernel() { - call void @call_without_alloca() - ret void -} - -define void @call_call_with_alloca() { - call void @call_with_alloca() - ret void -} - -define amdgpu_kernel void @call_call_with_alloca_cc_kernel() { - call void @call_with_alloca() - ret void -} - -define void @with_alloca_call_without_alloca() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @without_alloca() - ret void -} - -define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @without_alloca() - ret void -} - -define void @with_alloca_call_with_alloca() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @with_alloca() - ret void -} - -define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @with_alloca() - ret void -} - -define void @with_alloca_call_call_without_alloca() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @call_without_alloca() - ret void -} - -define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() { - %temp = alloca i32, 
addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @call_without_alloca() - ret void -} - -define void @with_alloca_call_call_with_alloca() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @call_with_alloca() - ret void -} - -define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() { - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @call_with_alloca() - ret void -} - ;; tests of addrspacecast define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { @@ -397,32 +203,6 @@ define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_ ret void } -;; tests of mixed alloca and addrspacecast - -define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) { - call void @without_alloca(i1 true) - call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) - ret void -} - -define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { - call void @without_alloca(i1 true) - call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) - ret void -} - -define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) { - call void @without_alloca(i1 true) - call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) - ret void -} - -define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { - call void @without_alloca(i1 true) - call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) - ret void -} - ;; tests of indirect call, intrinsics @gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4 @@ -473,201 +253,6 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ret void } -; GFX10: name: without_alloca -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: 
'$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } -; -; GFX10: name: with_alloca -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } -; -; GFX10: name: without_alloca_cc_kernel -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } -; -; GFX10: name: with_alloca_cc_kernel -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } -; -; GFX10: name: with_alloca_cc_vs -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr0' } -; -; GFX10: name: with_alloca_cc_gs -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr5' } -; -; GFX10: name: with_alloca_cc_ps -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr0' } -; -; GFX10: name: with_alloca_cc_cs -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr0' } -; -; GFX10: name: with_alloca_cc_hs -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr5' } -; -; GFX10: name: with_alloca_cc_ls -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr0' } -; -; GFX10: name: with_alloca_cc_es -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr0' } -; -; GFX10: name: with_alloca_cc_gfx -; GFX10: argumentInfo: -; GFX10-NEXT: 
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: psInputAddr: 0 -; -; GFX10: name: with_alloca_cc_cs_chain -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' } -; GFX10-NEXT: psInputAddr: 0 -; -; GFX10: name: with_alloca_cc_cs_chain_preserve -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' } -; GFX10-NEXT: psInputAddr: 0 -; -; GFX10: name: call_without_alloca -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } -; -; GFX10: name: call_without_alloca_cc_kernel -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr4' } -; -; GFX10: name: call_with_alloca -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } -; -; GFX10: name: call_with_alloca_cc_kernel -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } -; -; GFX10: name: call_both_with_and_without_alloca -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } -; -; GFX10: name: call_both_with_and_without_alloca_cc_kernel -; GFX10: argumentInfo: -; 
GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } -; -; GFX10: name: call_call_without_alloca -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } -; -; GFX10: name: call_call_without_alloca_cc_kernel -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr4' } -; -; GFX10: name: call_call_with_alloca -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } -; -; GFX10: name: call_call_with_alloca_cc_kernel -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } -; -; GFX10: name: with_alloca_call_without_alloca -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } -; -; GFX10: name: with_alloca_call_without_alloca_cc_kernel -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } -; -; GFX10: name: with_alloca_call_with_alloca -; GFX10: argumentInfo: -; 
GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } -; -; GFX10: name: with_alloca_call_with_alloca_cc_kernel -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } -; -; GFX10: name: with_alloca_call_call_without_alloca -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } -; -; GFX10: name: with_alloca_call_call_without_alloca_cc_kernel -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } -; -; GFX10: name: with_alloca_call_call_with_alloca -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } -; -; GFX10: name: with_alloca_call_call_with_alloca_cc_kernel -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: flatScratchInit: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } -; ; GFX10: name: without_global_to_flat_addrspacecast ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } @@ -921,34 +506,6 @@ define amdgpu_kernel void 
@call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } ; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } ; -; GFX10: name: call_without_alloca_and_without_addrspacecast -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } -; -; GFX10: name: call_without_alloca_and_without_addrspacecast_cc_kernel -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } -; -; GFX10: name: call_without_alloca_and_with_addrspacecast -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } -; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } -; -; GFX10: name: call_without_alloca_and_with_addrspacecast_cc_kernel -; GFX10: argumentInfo: -; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } -; ; GFX10: name: with_indirect_call ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll index b33ec280b034c..2229ce9500cac 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll @@ -2,378 +2,24 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: opt 
-S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s -;; tests of alloca - -define void @without_alloca(i1 %arg0) { -; GFX9-LABEL: define void @without_alloca(i1 %arg0) -; GFX9-SAME: #[[ATTR0_GFX9_NOFSI:[0-9]+]] -; -; GFX10-LABEL: define void @without_alloca(i1 %arg0) -; GFX10-SAME: #[[ATTR0_GFX10_NOFSI:[0-9]+]] - store volatile i1 %arg0, ptr addrspace(1) null - ret void -} - -define void @with_alloca() { -; GFX9-LABEL: define void @with_alloca() -; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] -; -; GFX10-LABEL: define void @with_alloca() -; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) { -; GFX9-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2:[0-9]+]] -; -; GFX10-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) -; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2:[0-9]+]] - store volatile i1 %arg0, ptr addrspace(1) null - ret void -} - -define amdgpu_kernel void @with_alloca_cc_kernel() { -; GFX9-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] -; -; GFX10-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -; graphics functions won't get the attribute amdgpu-no-flat-scratch-init - -define amdgpu_vs void @with_alloca_cc_vs() { -; GFX9-LABEL: define amdgpu_vs void @with_alloca_cc_vs() -; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS:[0-9]+]] -; -; GFX10-LABEL: define amdgpu_vs void @with_alloca_cc_vs() -; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS:[0-9]+]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_gs void @with_alloca_cc_gs() { -; GFX9-LABEL: 
define amdgpu_gs void @with_alloca_cc_gs() -; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS]] -; -; GFX10-LABEL: define amdgpu_gs void @with_alloca_cc_gs() -; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_ps void @with_alloca_cc_ps() { -; GFX9-LABEL: define amdgpu_ps void @with_alloca_cc_ps() -; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS]] -; -; GFX10-LABEL: define amdgpu_ps void @with_alloca_cc_ps() -; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_cs void @with_alloca_cc_cs() { -; GFX9-LABEL: define amdgpu_cs void @with_alloca_cc_cs() -; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS]] -; -; GFX10-LABEL: define amdgpu_cs void @with_alloca_cc_cs() -; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_hs void @with_alloca_cc_hs() { -; GFX9-LABEL: define amdgpu_hs void @with_alloca_cc_hs() -; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS]] -; -; GFX10-LABEL: define amdgpu_hs void @with_alloca_cc_hs() -; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_ls void @with_alloca_cc_ls() { -; GFX9-LABEL: define amdgpu_ls void @with_alloca_cc_ls() -; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS]] -; -; GFX10-LABEL: define amdgpu_ls void @with_alloca_cc_ls() -; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_es void @with_alloca_cc_es() { -; GFX9-LABEL: define amdgpu_es void @with_alloca_cc_es() -; GFX9-SAME: #[[ATTR2_GFX9_CC_GRAPHICS]] -; -; GFX10-LABEL: define amdgpu_es void @with_alloca_cc_es() -; GFX10-SAME: #[[ATTR2_GFX10_CC_GRAPHICS]] - %temp = 
alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_gfx void @with_alloca_cc_gfx() { -; GFX9-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx() -; GFX9-SAME: #[[ATTR3_GFX9_CC_GRAPHICS2:[0-9]+]] -; -; GFX10-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx() -; GFX10-SAME: #[[ATTR3_GFX10_CC_GRAPHICS2:[0-9]+]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_cs_chain void @with_alloca_cc_cs_chain() { -; GFX9-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain() -; GFX9-SAME: #[[ATTR3_GFX9_CC_GRAPHICS2]] -; -; GFX10-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain() -; GFX10-SAME: #[[ATTR3_GFX10_CC_GRAPHICS2]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() { -; GFX9-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() -; GFX9-SAME: #[[ATTR3_GFX9_CC_GRAPHICS2]] -; -; GFX10-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() -; GFX10-SAME: #[[ATTR3_GFX10_CC_GRAPHICS2]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - ret void -} - -define void @call_without_alloca() { -; GFX9-LABEL: define void @call_without_alloca() -; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] -; -; GFX10-LABEL: define void @call_without_alloca() -; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] - call void @without_alloca(i1 true) - ret void -} - -define amdgpu_kernel void @call_without_alloca_cc_kernel() { -; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] -; -; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] - call void @without_alloca(i1 true) - ret void -} - -define void @call_with_alloca() { -; GFX9-LABEL: define void 
@call_with_alloca() -; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] -; -; GFX10-LABEL: define void @call_with_alloca() -; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] - call void @with_alloca() - ret void -} - -define amdgpu_kernel void @call_with_alloca_cc_kernel() { -; GFX9-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] -; -; GFX10-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] - call void @with_alloca() - ret void -} - -define void @call_both_with_and_without_alloca() { -; GFX9-LABEL: define void @call_both_with_and_without_alloca() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]] -; -; GFX10-LABEL: define void @call_both_with_and_without_alloca() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]] - call void @with_alloca() - call void @without_alloca() - ret void -} - -define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() { -; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]] -; -; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]] - call void @with_alloca() - call void @without_alloca() - ret void -} - -define void @call_call_without_alloca() { -; GFX9-LABEL: define void @call_call_without_alloca() -; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] -; -; GFX10-LABEL: define void @call_call_without_alloca() -; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] - call void @call_without_alloca() - ret void -} - -define amdgpu_kernel void @call_call_without_alloca_cc_kernel() { -; GFX9-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] -; -; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] - call void @call_without_alloca() - ret void -} - -define void @call_call_with_alloca() { -; GFX9-LABEL: define void 
@call_call_with_alloca() -; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] -; -; GFX10-LABEL: define void @call_call_with_alloca() -; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] - call void @call_with_alloca() - ret void -} - -define amdgpu_kernel void @call_call_with_alloca_cc_kernel() { -; GFX9-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] -; -; GFX10-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] - call void @call_with_alloca() - ret void -} - -define void @with_alloca_call_without_alloca() { -; GFX9-LABEL: define void @with_alloca_call_without_alloca() -; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] -; -; GFX10-LABEL: define void @with_alloca_call_without_alloca() -; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @without_alloca() - ret void -} - -define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() { -; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] -; -; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @without_alloca() - ret void -} - -define void @with_alloca_call_with_alloca() { -; GFX9-LABEL: define void @with_alloca_call_with_alloca() -; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] -; -; GFX10-LABEL: define void @with_alloca_call_with_alloca() -; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @with_alloca() - ret void -} - -define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() { -; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] -; -; GFX10-LABEL: define 
amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @with_alloca() - ret void -} - -define void @with_alloca_call_call_without_alloca() { -; GFX9-LABEL: define void @with_alloca_call_call_without_alloca() -; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] -; -; GFX10-LABEL: define void @with_alloca_call_call_without_alloca() -; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @call_without_alloca() - ret void -} - -define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() { -; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] -; -; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @call_without_alloca() - ret void -} - -define void @with_alloca_call_call_with_alloca() { -; GFX9-LABEL: define void @with_alloca_call_call_with_alloca() -; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] -; -; GFX10-LABEL: define void @with_alloca_call_call_with_alloca() -; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @call_with_alloca() - ret void -} - -define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() { -; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] -; -; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() -; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] - %temp = alloca i32, addrspace(5) - store volatile i32 0, ptr addrspace(5) %temp, align 4 - call void @call_with_alloca() - ret void -} - ;; 
tests of addrspacecast define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { ; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] +; GFX9-SAME: #[[ATTR0_GFX9_NOFSI:[0-9]+]] ; ; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] +; GFX10-SAME: #[[ATTR0_GFX10_NOFSI:[0-9]+]] store volatile i32 0, ptr addrspace(1) %ptr ret void } define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) -; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] +; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2:[0-9]+]] store volatile i32 0, ptr addrspace(1) %ptr ret void } @@ -548,10 +194,10 @@ define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr a define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI:[0-9]+]] +; GFX9-SAME: #[[ATTR2_GFX9_NO_NOFSI:[0-9]+]] ; ; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI:[0-9]+]] +; GFX10-SAME: #[[ATTR2_GFX10_NO_NOFSI:[0-9]+]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -559,10 +205,10 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: 
#[[ATTR5_GFX9_NO_NOFSI2:[0-9]+]] +; GFX9-SAME: #[[ATTR3_GFX9_NO_NOFSI2:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2:[0-9]+]] +; GFX10-SAME: #[[ATTR3_GFX10_NO_NOFSI2:[0-9]+]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof ret void @@ -590,30 +236,30 @@ define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel( define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR2_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR2_GFX10_NO_NOFSI]] call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR3_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR3_GFX10_NO_NOFSI2]] call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR2_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] +; 
GFX10-SAME: #[[ATTR2_GFX10_NO_NOFSI]] call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -621,10 +267,10 @@ define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrsp define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR3_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR3_GFX10_NO_NOFSI2]] call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -652,50 +298,50 @@ define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_ke define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR2_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR2_GFX10_NO_NOFSI]] call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR3_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void 
@call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR3_GFX10_NO_NOFSI2]] call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR2_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR2_GFX10_NO_NOFSI]] call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR3_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR3_GFX10_NO_NOFSI2]] call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR2_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR2_GFX10_NO_NOFSI]] %stof = addrspacecast ptr 
addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -704,10 +350,10 @@ define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace( define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR3_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR3_GFX10_NO_NOFSI2]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -716,10 +362,10 @@ define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] +; GFX9-SAME: #[[ATTR2_GFX9_NO_NOFSI]] ; ; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] +; GFX10-SAME: #[[ATTR2_GFX10_NO_NOFSI]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) @@ -728,59 +374,55 @@ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: 
#[[ATTR5_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR3_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR3_GFX10_NO_NOFSI2]] %stof = addrspacecast ptr addrspace(5) %ptr to ptr store volatile i32 0, ptr %stof call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void } -;; tests of mixed alloca and addrspacecast +;; tests of addrspacecast in a constant -define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) { -; GFX9-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR0_GFX9_NOFSI]] -; -; GFX10-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR0_GFX10_NOFSI]] - call void @without_alloca(i1 true) - call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +define amdgpu_kernel void @with_global_to_flat_in_const() { +; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_in_const() +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] + store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(1) null to ptr addrspace(0)) ret void } -define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { -; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) +define amdgpu_kernel void @with_region_to_flat_in_const() { +; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_in_const() ; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] -; -; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR1_GFX10_NOFSI2]] - call void @without_alloca(i1 true) - call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + store i32 7, ptr addrspace(0) addrspacecast (ptr 
addrspace(2) null to ptr addrspace(0)) ret void } -define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) { -; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR4_GFX9_NO_NOFSI]] -; -; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR4_GFX10_NO_NOFSI]] - call void @without_alloca(i1 true) - call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +define amdgpu_kernel void @with_local_to_flat_in_const() { +; GFX9-LABEL: define amdgpu_kernel void @with_local_to_flat_in_const() +; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] + store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(3) null to ptr addrspace(0)) ret void } -define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { -; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] -; -; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) -; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] - call void @without_alloca(i1 true) - call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) +define amdgpu_kernel void @with_constant_to_flat_in_const() { + store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(3) null to ptr addrspace(0)) + ret void +} + +define amdgpu_kernel void @with_private_to_flat_in_const() { +; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_in_const() +; GFX9-SAME: #[[ATTR3_GFX9_NO_NOFSI2:[0-9]+]] + store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(5) null to ptr addrspace(0)) + ret void +} + +define amdgpu_kernel void @call_with_private_to_flat_in_const() { +; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_in_const() +; GFX9-SAME: #[[ATTR3_GFX9_NO_NOFSI2:[0-9]+]] + call void 
@with_private_to_flat_in_const() ret void } @@ -790,10 +432,10 @@ define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel( define void @with_indirect_call() { ; GFX9-LABEL: define void @with_indirect_call() -; GFX9-SAME: #[[ATTR6_GFX9_IND_CALL:[0-9]+]] +; GFX9-SAME: #[[ATTR2_GFX9_IND_CALL:[0-9]+]] ; ; GFX10-LABEL: define void @with_indirect_call() -; GFX10-SAME: #[[ATTR6_GFX10_IND_CALL:[0-9]+]] { +; GFX10-SAME: #[[ATTR2_GFX10_IND_CALL:[0-9]+]] { %fptr = load ptr, ptr addrspace(4) @gv.fptr0 call void %fptr() ret void @@ -801,10 +443,10 @@ define void @with_indirect_call() { define amdgpu_kernel void @with_indirect_call_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel() -; GFX9-SAME: #[[ATTR7_GFX9_IND_CALL2:[0-9]+]] +; GFX9-SAME: #[[ATTR3_GFX9_IND_CALL2:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel() -; GFX10-SAME: #[[ATTR7_GFX10_IND_CALL2:[0-9]+]] +; GFX10-SAME: #[[ATTR3_GFX10_IND_CALL2:[0-9]+]] %fptr = load ptr, ptr addrspace(4) @gv.fptr0 call void %fptr() ret void @@ -812,20 +454,20 @@ define amdgpu_kernel void @with_indirect_call_cc_kernel() { define void @call_with_indirect_call() { ; GFX9-LABEL: define void @call_with_indirect_call() -; GFX9-SAME: #[[ATTR6_GFX9_IND_CALL]] +; GFX9-SAME: #[[ATTR4_GFX9_IND_CALL:[0-9]+]] ; ; GFX10-LABEL: define void @call_with_indirect_call() -; GFX10-SAME: #[[ATTR6_GFX10_IND_CALL]] +; GFX10-SAME: #[[ATTR4_GFX10_IND_CALL:[0-9]+]] call void @with_indirect_call() ret void } define amdgpu_kernel void @call_with_indirect_call_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel() -; GFX9-SAME: #[[ATTR7_GFX9_IND_CALL2]] +; GFX9-SAME: #[[ATTR5_GFX9_IND_CALL2:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel() -; GFX10-SAME: #[[ATTR7_GFX10_IND_CALL2]] +; GFX10-SAME: #[[ATTR5_GFX10_IND_CALL2:[0-9]+]] call void @with_indirect_call() ret void } @@ -840,10 +482,10 @@ define 
void @also_empty() { define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) { ; GFX9-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) -; GFX9-SAME: #[[ATTR8_GFX9_NOFSI3:[0-9]+]] +; GFX9-SAME: #[[ATTR6_GFX9_NOFSI3:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) -; GFX10-SAME: #[[ATTR8_GFX10_NOFSI3:[0-9]+]] +; GFX10-SAME: #[[ATTR6_GFX10_NOFSI3:[0-9]+]] %fptr = select i1 %cond, ptr @empty, ptr @also_empty call void %fptr() ret void @@ -853,10 +495,10 @@ declare i32 @llvm.amdgcn.workgroup.id.x() define void @use_intrinsic_workitem_id_x() { ; GFX9-LABEL: define void @use_intrinsic_workitem_id_x() -; GFX9-SAME: #[[ATTR10_GFX9_NOFSI4:[0-9]+]] +; GFX9-SAME: #[[ATTR8_GFX9_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x() -; GFX10-SAME: #[[ATTR10_GFX10_NOFSI4:[0-9]+]] +; GFX10-SAME: #[[ATTR8_GFX10_NOFSI4:[0-9]+]] %val = call i32 @llvm.amdgcn.workitem.id.x() store volatile i32 %val, ptr addrspace(1) null ret void @@ -875,30 +517,30 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() { define void @call_use_intrinsic_workitem_id_x() { ; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x() -; GFX9-SAME: #[[ATTR10_GFX9_NOFSI4]] +; GFX9-SAME: #[[ATTR6_GFX9_NOFSI4:[0-9]+]] ; ; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x() -; GFX10-SAME: #[[ATTR10_GFX10_NOFSI4]] +; GFX10-SAME: #[[ATTR6_GFX10_NOFSI4:[0-9]+]] call void @use_intrinsic_workitem_id_x() ret void } define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() -; GFX9-SAME: #[[ATTR11_GFX9_NOFSI5:[0-9]+]] +; GFX9-SAME: #[[ATTR9_GFX9_NOFSI5:[0-9]+]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() -; GFX10-SAME: #[[ATTR11_GFX10_NOFSI5:[0-9]+]] +; GFX10-SAME: #[[ATTR9_GFX10_NOFSI5:[0-9]+]] call void @use_intrinsic_workitem_id_x() ret void } define 
amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR3_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR3_GFX10_NO_NOFSI2]] %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) store volatile i32 7, ptr %1, align 4 ret void @@ -906,20 +548,20 @@ define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) { ; GFX9-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) -; GFX9-SAME: #[[ATTR5_GFX9_NO_NOFSI2]] +; GFX9-SAME: #[[ATTR3_GFX9_NO_NOFSI2]] ; ; GFX10-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) -; GFX10-SAME: #[[ATTR5_GFX10_NO_NOFSI2]] +; GFX10-SAME: #[[ATTR3_GFX10_NO_NOFSI2]] call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) ret void } define amdgpu_kernel void @with_inline_asm() { ; GFX9-LABEL: with_inline_asm -; GFX9-SAME: #[[ATTR8_GFX9_NOFSI3]] +; GFX9-SAME: #[[ATTR6_GFX9_NOFSI3]] ; ; GFX10-LABEL: with_inline_asm -; GFX10-SAME: #[[ATTR8_GFX10_NOFSI3]] +; GFX10-SAME: #[[ATTR6_GFX10_NOFSI3]] call void asm sideeffect "; use $0", "a"(i32 poison) ret void } @@ -928,21 +570,18 @@ define amdgpu_kernel void @with_inline_asm() { ; GFX9: attributes #[[ATTR1_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR2_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR3_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } - -; GFX9: attributes #[[ATTR4_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR2_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR5_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR3_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR6_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR7_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR4_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR5_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR8_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR6_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR10_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR8_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; GFX9: attributes #[[ATTR11_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes 
#[[ATTR9_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } @@ -953,18 +592,15 @@ define amdgpu_kernel void @with_inline_asm() { ; GFX10: attributes #[[ATTR1_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR2_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR3_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } - -; GFX10: attributes #[[ATTR4_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" 
"uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR2_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR5_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR3_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR6_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR7_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR4_GFX10_IND_CALL]] = { 
"amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR5_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR8_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR6_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR10_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR8_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } -; GFX10: attributes #[[ATTR11_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR9_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } From b3c81f9ea990305d77c0d0fc940ae355d1a7301f Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Fri, 4 Oct 2024 15:22:12 -0700 Subject: [PATCH 09/13] Add a testcase for addrspacecast inside a constant. 
--- .../AMDGPU/attributor-flatscratchinit.ll | 44 +++---------------- 1 file changed, 7 insertions(+), 37 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll index 2229ce9500cac..6984bf45a4bb4 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll @@ -386,43 +386,13 @@ define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_ ;; tests of addrspacecast in a constant -define amdgpu_kernel void @with_global_to_flat_in_const() { -; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_in_const() -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] - store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(1) null to ptr addrspace(0)) - ret void -} - -define amdgpu_kernel void @with_region_to_flat_in_const() { -; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_in_const() -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] - store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(2) null to ptr addrspace(0)) - ret void -} - -define amdgpu_kernel void @with_local_to_flat_in_const() { -; GFX9-LABEL: define amdgpu_kernel void @with_local_to_flat_in_const() -; GFX9-SAME: #[[ATTR1_GFX9_NOFSI2]] - store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(3) null to ptr addrspace(0)) - ret void -} - -define amdgpu_kernel void @with_constant_to_flat_in_const() { - store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(3) null to ptr addrspace(0)) - ret void -} - -define amdgpu_kernel void @with_private_to_flat_in_const() { -; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_in_const() -; GFX9-SAME: #[[ATTR3_GFX9_NO_NOFSI2:[0-9]+]] - store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(5) null to ptr addrspace(0)) - ret void -} - -define amdgpu_kernel void @call_with_private_to_flat_in_const() { -; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_in_const() -; GFX9-SAME: 
#[[ATTR3_GFX9_NO_NOFSI2:[0-9]+]] - call void @with_private_to_flat_in_const() +define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) { +; GFX9-LABEL: define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) +; GFX9-SAME: #[[ATTR3_GFX9_NO_NOFSI2]] +; +; GFX10-LABEL: define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) +; GFX10-SAME: #[[ATTR3_GFX10_NO_NOFSI2]] + store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8 ret void } From 09012f48c8a5242db946656c905a60df66b820be Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Mon, 7 Oct 2024 10:40:51 -0700 Subject: [PATCH 10/13] Update tests after merging from main. --- llvm/test/CodeGen/AMDGPU/addrspacecast.ll | 41 +- ...amdgpu-codegenprepare-fold-binop-select.ll | 7 +- .../AMDGPU/annotate-kernel-features-hsa.ll | 9 +- .../AMDGPU/call-graph-register-usage.ll | 12 +- llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll | 52 +-- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll | 24 +- llvm/test/CodeGen/AMDGPU/ipra.ll | 2 +- llvm/test/CodeGen/AMDGPU/min.ll | 1 + ...al-regcopy-and-spill-missed-at-regalloc.ll | 27 +- .../AMDGPU/remove-no-kernel-id-attribute.ll | 2 +- .../scc-clobbered-sgpr-to-vmem-spill.ll | 442 +++++++++--------- .../CodeGen/AMDGPU/spill-vector-superclass.ll | 6 +- 12 files changed, 309 insertions(+), 316 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 62f6890e92662..7336543b41cbc 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -63,16 +63,23 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 { ; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast: -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}} -; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}} -; HSA-DAG: 
v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]] -; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3] -; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1] -; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9 -; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7 -; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0 -; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 -; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen +; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}} + +; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; CI-DAG: s_cmp_lg_u32 [[PTR]], -1 +; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0 +; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 + +; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} +; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base + +; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; GFX9: s_cmp_lg_u32 [[PTR]], -1 +; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0 +; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0 + +; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] ; HSA: .amdhsa_user_sgpr_private_segment_buffer 1 ; HSA: .amdhsa_user_sgpr_dispatch_ptr 0 @@ -252,11 +259,8 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 { ; FIXME: Shouldn't need to enable queue ptr ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast: -; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3] -; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1] -; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7 -; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5 -; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0 +; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} +; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]] define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 { @@ -277,12 +281,7 @@ define 
amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 { ; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast: -; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3] -; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1] -; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7 -; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5 -; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0 -; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}} +; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index bf72cccd912ce..de318e7ae31a5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -393,12 +393,11 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) { ; GCN-LABEL: select_add_lhs_const_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 -; GCN-NEXT: v_mov_b32_e32 v0, 0x83 -; GCN-NEXT: v_mov_b32_e32 v1, 0x80 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: s_movk_i32 s0, 0x80 +; GCN-NEXT: s_cselect_b32 s0, s0, 0x83 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: flat_store_short v[0:1], v0 ; GCN-NEXT: s_endpgm %select = select i1 %cond, i16 5, i16 8 diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index bab5b5c31714c..239bdfde323cf 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -443,7 +443,8 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr 
addrspace(5) %p ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(5) [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(5) %ptr to ptr @@ -485,7 +486,8 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(1) %ptr to ptr @@ -502,7 +504,8 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) % ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] { -; ATTRIBUTOR_HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr +; ATTRIBUTOR_HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(4) %ptr to ptr diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index 51fff3444324f..dbd00f09943c0 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -33,9 +33,9 @@ define void @indirect_use_vcc() #1 { } ; GCN-LABEL: 
{{^}}indirect_2level_use_vcc_kernel: -; CI: ; NumSgprs: 38 -; VI-NOBUG: ; NumSgprs: 40 -; VI-BUG: ; NumSgprs: 96 +; CI: ; TotalNumSgprs: 38 +; VI-NOBUG: ; TotalNumSgprs: 40 +; VI-BUG: ; TotalNumSgprs: 96 ; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 { call void @indirect_use_vcc() @@ -121,9 +121,9 @@ define void @indirect_use_80_sgpr() #1 { } ; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr: -; CI: ; NumSgprs: 84 -; VI-NOBUG: ; NumSgprs: 86 -; VI-BUG: ; NumSgprs: 96 +; CI: ; TotalNumSgprs: 84 +; VI-NOBUG: ; TotalNumSgprs: 86 +; VI-BUG: ; TotalNumSgprs: 96 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 { call void @indirect_use_80_sgpr() ret void diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll index e4ffedd686ac9..6deab28fbe2e9 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -25,11 +25,11 @@ ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; NumSgprs: 8 -; VI-NOXNACK: ; NumSgprs: 8 -; VI-XNACK: ; NumSgprs: 12 -; GFX9-ARCH-FLAT: ; NumSgprs: 14 -; GFX10-ARCH-FLAT: ; NumSgprs: 8 +; CI: ; TotalNumSgprs: 8 +; VI-NOXNACK: ; TotalNumSgprs: 8 +; VI-XNACK: ; TotalNumSgprs: 12 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8 define amdgpu_kernel void @no_vcc_no_flat() { entry: call void asm sideeffect "", "~{s7}"() @@ -42,11 +42,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; NumSgprs: 10 -; VI-NOXNACK: ; NumSgprs: 10 -; VI-XNACK: ; NumSgprs: 12 -; GFX9-ARCH-FLAT: ; NumSgprs: 14 -; GFX10-ARCH-FLAT: ; NumSgprs: 10 +; CI: ; TotalNumSgprs: 10 +; VI-NOXNACK: ; TotalNumSgprs: 10 +; VI-XNACK: ; TotalNumSgprs: 12 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10 define amdgpu_kernel void @vcc_no_flat() { entry: call void 
asm sideeffect "", "~{s7},~{vcc}"() @@ -59,11 +59,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; NumSgprs: 12 -; VI-NOXNACK: ; NumSgprs: 14 -; VI-XNACK: ; NumSgprs: 14 -; GFX9-ARCH-FLAT: ; NumSgprs: 14 -; GFX10-ARCH-FLAT: ; NumSgprs: 8 +; CI: ; TotalNumSgprs: 12 +; VI-NOXNACK: ; TotalNumSgprs: 14 +; VI-XNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8 define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{flat_scratch}"() @@ -76,11 +76,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; NumSgprs: 12 -; VI-NOXNACK: ; NumSgprs: 14 -; VI-XNACK: ; NumSgprs: 14 -; GFX9-ARCH-FLAT: ; NumSgprs: 14 -; GFX10-ARCH-FLAT: ; NumSgprs: 10 +; CI: ; TotalNumSgprs: 12 +; VI-NOXNACK: ; TotalNumSgprs: 14 +; VI-XNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10 define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"() @@ -99,8 +99,8 @@ entry: ; CI: NumSgprs: 4 ; VI-NOXNACK: NumSgprs: 6 ; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; NumSgprs: 6 -; GFX10-ARCH-FLAT: ; NumSgprs: 0 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch}"() @@ -116,8 +116,8 @@ entry: ; CI: NumSgprs: 4 ; VI-NOXNACK: NumSgprs: 6 ; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; NumSgprs: 6 -; GFX10-ARCH-FLAT: ; NumSgprs: 0 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr_lo() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"() @@ -133,8 +133,8 @@ entry: ; CI: NumSgprs: 4 ; VI-NOXNACK: NumSgprs: 6 ; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; NumSgprs: 6 -; GFX10-ARCH-FLAT: ; NumSgprs: 
0 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr_hi() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"() diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 4d62d30a38ed3..292722c2607ad 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %11 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %11 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %9 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %9 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %9 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - 
; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %11 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %11 ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %11 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %9 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %9 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %9 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6619145 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -46,16 +46,16 @@ define amdgpu_kernel void @v_input_output_i128() { define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %11 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:AReg_128 */, def %11 ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %11 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: 
a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %9 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:AReg_128_Align2 */, def %9 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %9 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6488073 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll index 758e8b6e1e2ee..957f404c8cdbe 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -30,7 +30,7 @@ define hidden void @func() #1 { ; GCN-NOT: writelane ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 -; GCN: ; NumSgprs: 37 +; GCN: ; TotalNumSgprs: 37 ; GCN: ; NumVgprs: 9 define amdgpu_kernel void @kernel_call() #0 { %vgpr = load volatile i32, ptr addrspace(1) undef diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 02ce58dd75403..05ef2698c1f77 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -591,6 +591,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[6:7], 0x28 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s3, s[6:7], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 24 ; VI-NEXT: s_bfe_i32 s5, s2, 0x80010 diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 5a8d0f3d0f158..b40d35dbd8ac6 100644 --- 
a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -11,19 +11,15 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %26 - ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %26 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %23 - ; REGALLOC-GFX908-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) - ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]] - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %6 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %7 + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; 
REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec - ; REGALLOC-GFX908-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX908-NEXT: S_ENDPGM 0 @@ -60,18 +56,15 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %25 - ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %25 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %23 - ; REGALLOC-GFX90A-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, 
[[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %6 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %7 + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; REGALLOC-GFX90A-NEXT: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef 
%18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX90A-NEXT: S_ENDPGM 0 ; diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll index 58a6437bcbf69..8792e60bb0ca1 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll @@ -178,7 +178,7 @@ define internal void @mutual_recursion_1(i16 %arg) { define amdgpu_kernel void @kernel_lds_recursion() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_lds_recursion( -; CHECK-SAME: ) #[[ATTR6:[0-9]+]] !llvm.amdgcn.lds.kernel.id !9 { +; CHECK-SAME: ) #[[ATTR2]] !llvm.amdgcn.lds.kernel.id [[META9:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds) ] ; CHECK-NEXT: call void @mutual_recursion_0(i16 0) ; CHECK-NEXT: ret void diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 5dcb4b7c979ab..7f8240eeb98eb 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -26,40 +26,40 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 2 -; CHECK-NEXT: v_writelane_b32 v23, s5, 3 -; CHECK-NEXT: v_writelane_b32 v23, s6, 4 -; CHECK-NEXT: v_writelane_b32 v23, s7, 5 +; CHECK-NEXT: v_writelane_b32 v22, s4, 2 +; CHECK-NEXT: v_writelane_b32 v22, s5, 3 +; CHECK-NEXT: v_writelane_b32 v22, s6, 4 +; CHECK-NEXT: v_writelane_b32 v22, s7, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 6 -; CHECK-NEXT: v_writelane_b32 v23, s5, 7 -; 
CHECK-NEXT: v_writelane_b32 v23, s6, 8 -; CHECK-NEXT: v_writelane_b32 v23, s7, 9 -; CHECK-NEXT: v_writelane_b32 v23, s8, 10 -; CHECK-NEXT: v_writelane_b32 v23, s9, 11 -; CHECK-NEXT: v_writelane_b32 v23, s10, 12 -; CHECK-NEXT: v_writelane_b32 v23, s11, 13 +; CHECK-NEXT: v_writelane_b32 v22, s4, 6 +; CHECK-NEXT: v_writelane_b32 v22, s5, 7 +; CHECK-NEXT: v_writelane_b32 v22, s6, 8 +; CHECK-NEXT: v_writelane_b32 v22, s7, 9 +; CHECK-NEXT: v_writelane_b32 v22, s8, 10 +; CHECK-NEXT: v_writelane_b32 v22, s9, 11 +; CHECK-NEXT: v_writelane_b32 v22, s10, 12 +; CHECK-NEXT: v_writelane_b32 v22, s11, 13 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:19] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 14 -; CHECK-NEXT: v_writelane_b32 v23, s5, 15 -; CHECK-NEXT: v_writelane_b32 v23, s6, 16 -; CHECK-NEXT: v_writelane_b32 v23, s7, 17 -; CHECK-NEXT: v_writelane_b32 v23, s8, 18 -; CHECK-NEXT: v_writelane_b32 v23, s9, 19 -; CHECK-NEXT: v_writelane_b32 v23, s10, 20 -; CHECK-NEXT: v_writelane_b32 v23, s11, 21 -; CHECK-NEXT: v_writelane_b32 v23, s12, 22 -; CHECK-NEXT: v_writelane_b32 v23, s13, 23 -; CHECK-NEXT: v_writelane_b32 v23, s14, 24 -; CHECK-NEXT: v_writelane_b32 v23, s15, 25 -; CHECK-NEXT: v_writelane_b32 v23, s16, 26 -; CHECK-NEXT: v_writelane_b32 v23, s17, 27 -; CHECK-NEXT: v_writelane_b32 v23, s18, 28 -; CHECK-NEXT: v_writelane_b32 v23, s19, 29 +; CHECK-NEXT: v_writelane_b32 v22, s4, 14 +; CHECK-NEXT: v_writelane_b32 v22, s5, 15 +; CHECK-NEXT: v_writelane_b32 v22, s6, 16 +; CHECK-NEXT: v_writelane_b32 v22, s7, 17 +; CHECK-NEXT: v_writelane_b32 v22, s8, 18 +; CHECK-NEXT: v_writelane_b32 v22, s9, 19 +; CHECK-NEXT: v_writelane_b32 v22, s10, 20 +; CHECK-NEXT: v_writelane_b32 v22, s11, 21 +; CHECK-NEXT: v_writelane_b32 v22, s12, 22 +; CHECK-NEXT: v_writelane_b32 v22, s13, 23 +; CHECK-NEXT: v_writelane_b32 v22, s14, 24 +; CHECK-NEXT: v_writelane_b32 v22, s15, 25 +; CHECK-NEXT: v_writelane_b32 v22, s16, 26 +; CHECK-NEXT: v_writelane_b32 v22, s17, 27 +; CHECK-NEXT: 
v_writelane_b32 v22, s18, 28 +; CHECK-NEXT: v_writelane_b32 v22, s19, 29 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[42:43] ; CHECK-NEXT: ;;#ASMEND @@ -69,14 +69,14 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 30 -; CHECK-NEXT: v_writelane_b32 v23, s5, 31 -; CHECK-NEXT: v_writelane_b32 v23, s6, 32 -; CHECK-NEXT: v_writelane_b32 v23, s7, 33 -; CHECK-NEXT: v_writelane_b32 v23, s8, 34 -; CHECK-NEXT: v_writelane_b32 v23, s9, 35 -; CHECK-NEXT: v_writelane_b32 v23, s10, 36 -; CHECK-NEXT: v_writelane_b32 v23, s11, 37 +; CHECK-NEXT: v_writelane_b32 v22, s4, 30 +; CHECK-NEXT: v_writelane_b32 v22, s5, 31 +; CHECK-NEXT: v_writelane_b32 v22, s6, 32 +; CHECK-NEXT: v_writelane_b32 v22, s7, 33 +; CHECK-NEXT: v_writelane_b32 v22, s8, 34 +; CHECK-NEXT: v_writelane_b32 v22, s9, 35 +; CHECK-NEXT: v_writelane_b32 v22, s10, 36 +; CHECK-NEXT: v_writelane_b32 v22, s11, 37 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART @@ -94,105 +94,105 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 38 -; CHECK-NEXT: v_writelane_b32 v23, s1, 39 -; CHECK-NEXT: v_writelane_b32 v23, s2, 40 -; CHECK-NEXT: v_writelane_b32 v23, s3, 41 -; CHECK-NEXT: v_writelane_b32 v23, s4, 42 -; CHECK-NEXT: v_writelane_b32 v23, s5, 43 -; CHECK-NEXT: v_writelane_b32 v23, s6, 44 -; CHECK-NEXT: v_writelane_b32 v23, s7, 45 -; CHECK-NEXT: v_writelane_b32 v23, s8, 46 -; CHECK-NEXT: v_writelane_b32 v23, s9, 47 -; CHECK-NEXT: v_writelane_b32 v23, s10, 48 -; CHECK-NEXT: v_writelane_b32 v23, s11, 49 -; CHECK-NEXT: v_writelane_b32 v23, s12, 50 -; CHECK-NEXT: v_writelane_b32 v23, s13, 51 -; CHECK-NEXT: v_writelane_b32 v23, s14, 52 -; CHECK-NEXT: v_writelane_b32 v23, s15, 53 +; CHECK-NEXT: 
v_writelane_b32 v22, s0, 38 +; CHECK-NEXT: v_writelane_b32 v22, s1, 39 +; CHECK-NEXT: v_writelane_b32 v22, s2, 40 +; CHECK-NEXT: v_writelane_b32 v22, s3, 41 +; CHECK-NEXT: v_writelane_b32 v22, s4, 42 +; CHECK-NEXT: v_writelane_b32 v22, s5, 43 +; CHECK-NEXT: v_writelane_b32 v22, s6, 44 +; CHECK-NEXT: v_writelane_b32 v22, s7, 45 +; CHECK-NEXT: v_writelane_b32 v22, s8, 46 +; CHECK-NEXT: v_writelane_b32 v22, s9, 47 +; CHECK-NEXT: v_writelane_b32 v22, s10, 48 +; CHECK-NEXT: v_writelane_b32 v22, s11, 49 +; CHECK-NEXT: v_writelane_b32 v22, s12, 50 +; CHECK-NEXT: v_writelane_b32 v22, s13, 51 +; CHECK-NEXT: v_writelane_b32 v22, s14, 52 +; CHECK-NEXT: v_writelane_b32 v22, s15, 53 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 54 -; CHECK-NEXT: v_writelane_b32 v23, s1, 55 -; CHECK-NEXT: v_writelane_b32 v23, s2, 56 -; CHECK-NEXT: v_writelane_b32 v23, s3, 57 +; CHECK-NEXT: v_writelane_b32 v22, s0, 54 +; CHECK-NEXT: v_writelane_b32 v22, s1, 55 +; CHECK-NEXT: v_writelane_b32 v22, s2, 56 +; CHECK-NEXT: v_writelane_b32 v22, s3, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 58 -; CHECK-NEXT: v_writelane_b32 v23, s1, 59 -; CHECK-NEXT: v_writelane_b32 v23, s2, 60 -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v23, s3, 61 -; CHECK-NEXT: v_writelane_b32 v23, s4, 62 -; CHECK-NEXT: v_writelane_b32 v0, s6, 0 -; CHECK-NEXT: v_writelane_b32 v23, s5, 63 -; CHECK-NEXT: v_writelane_b32 v0, s7, 1 +; CHECK-NEXT: v_writelane_b32 v22, s0, 58 +; CHECK-NEXT: v_writelane_b32 v22, s1, 59 +; CHECK-NEXT: v_writelane_b32 v22, s2, 60 +; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v22, s3, 61 +; CHECK-NEXT: v_writelane_b32 v22, s4, 62 +; CHECK-NEXT: v_writelane_b32 v23, s6, 0 +; CHECK-NEXT: v_writelane_b32 v22, s5, 63 
+; CHECK-NEXT: v_writelane_b32 v23, s7, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 2 -; CHECK-NEXT: v_writelane_b32 v0, s1, 3 -; CHECK-NEXT: v_writelane_b32 v0, s2, 4 -; CHECK-NEXT: v_writelane_b32 v0, s3, 5 -; CHECK-NEXT: v_writelane_b32 v0, s4, 6 -; CHECK-NEXT: v_writelane_b32 v0, s5, 7 -; CHECK-NEXT: v_writelane_b32 v0, s6, 8 -; CHECK-NEXT: v_writelane_b32 v0, s7, 9 -; CHECK-NEXT: v_writelane_b32 v0, s8, 10 -; CHECK-NEXT: v_writelane_b32 v0, s9, 11 -; CHECK-NEXT: v_writelane_b32 v0, s10, 12 -; CHECK-NEXT: v_writelane_b32 v0, s11, 13 -; CHECK-NEXT: v_writelane_b32 v0, s12, 14 -; CHECK-NEXT: v_writelane_b32 v0, s13, 15 -; CHECK-NEXT: v_writelane_b32 v0, s14, 16 -; CHECK-NEXT: v_writelane_b32 v0, s15, 17 +; CHECK-NEXT: v_writelane_b32 v23, s0, 2 +; CHECK-NEXT: v_writelane_b32 v23, s1, 3 +; CHECK-NEXT: v_writelane_b32 v23, s2, 4 +; CHECK-NEXT: v_writelane_b32 v23, s3, 5 +; CHECK-NEXT: v_writelane_b32 v23, s4, 6 +; CHECK-NEXT: v_writelane_b32 v23, s5, 7 +; CHECK-NEXT: v_writelane_b32 v23, s6, 8 +; CHECK-NEXT: v_writelane_b32 v23, s7, 9 +; CHECK-NEXT: v_writelane_b32 v23, s8, 10 +; CHECK-NEXT: v_writelane_b32 v23, s9, 11 +; CHECK-NEXT: v_writelane_b32 v23, s10, 12 +; CHECK-NEXT: v_writelane_b32 v23, s11, 13 +; CHECK-NEXT: v_writelane_b32 v23, s12, 14 +; CHECK-NEXT: v_writelane_b32 v23, s13, 15 +; CHECK-NEXT: v_writelane_b32 v23, s14, 16 +; CHECK-NEXT: v_writelane_b32 v23, s15, 17 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 18 -; CHECK-NEXT: v_writelane_b32 v0, s1, 19 +; CHECK-NEXT: v_writelane_b32 v23, s0, 18 +; CHECK-NEXT: v_writelane_b32 v23, s1, 19 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 20 -; CHECK-NEXT: v_writelane_b32 v0, s1, 21 -; CHECK-NEXT: v_writelane_b32 v0, s2, 22 -; CHECK-NEXT: v_writelane_b32 v0, s3, 23 +; CHECK-NEXT: v_writelane_b32 
v23, s0, 20 +; CHECK-NEXT: v_writelane_b32 v23, s1, 21 +; CHECK-NEXT: v_writelane_b32 v23, s2, 22 +; CHECK-NEXT: v_writelane_b32 v23, s3, 23 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 24 -; CHECK-NEXT: v_writelane_b32 v0, s1, 25 -; CHECK-NEXT: v_writelane_b32 v0, s2, 26 -; CHECK-NEXT: v_writelane_b32 v0, s3, 27 -; CHECK-NEXT: v_writelane_b32 v0, s4, 28 -; CHECK-NEXT: v_writelane_b32 v0, s5, 29 -; CHECK-NEXT: v_writelane_b32 v0, s6, 30 -; CHECK-NEXT: v_writelane_b32 v0, s7, 31 +; CHECK-NEXT: v_writelane_b32 v23, s0, 24 +; CHECK-NEXT: v_writelane_b32 v23, s1, 25 +; CHECK-NEXT: v_writelane_b32 v23, s2, 26 +; CHECK-NEXT: v_writelane_b32 v23, s3, 27 +; CHECK-NEXT: v_writelane_b32 v23, s4, 28 +; CHECK-NEXT: v_writelane_b32 v23, s5, 29 +; CHECK-NEXT: v_writelane_b32 v23, s6, 30 +; CHECK-NEXT: v_writelane_b32 v23, s7, 31 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 32 -; CHECK-NEXT: v_writelane_b32 v0, s1, 33 -; CHECK-NEXT: v_writelane_b32 v0, s2, 34 -; CHECK-NEXT: v_writelane_b32 v0, s3, 35 -; CHECK-NEXT: v_writelane_b32 v0, s4, 36 -; CHECK-NEXT: v_writelane_b32 v0, s5, 37 -; CHECK-NEXT: v_writelane_b32 v0, s6, 38 -; CHECK-NEXT: v_writelane_b32 v0, s7, 39 -; CHECK-NEXT: v_writelane_b32 v0, s8, 40 -; CHECK-NEXT: v_writelane_b32 v0, s9, 41 -; CHECK-NEXT: v_writelane_b32 v0, s10, 42 -; CHECK-NEXT: v_writelane_b32 v0, s11, 43 -; CHECK-NEXT: v_writelane_b32 v0, s12, 44 -; CHECK-NEXT: v_writelane_b32 v0, s13, 45 -; CHECK-NEXT: v_writelane_b32 v0, s14, 46 -; CHECK-NEXT: v_writelane_b32 v0, s15, 47 +; CHECK-NEXT: v_writelane_b32 v23, s0, 32 +; CHECK-NEXT: v_writelane_b32 v23, s1, 33 +; CHECK-NEXT: v_writelane_b32 v23, s2, 34 +; CHECK-NEXT: v_writelane_b32 v23, s3, 35 +; CHECK-NEXT: v_writelane_b32 v23, s4, 36 +; CHECK-NEXT: v_writelane_b32 v23, s5, 37 +; CHECK-NEXT: v_writelane_b32 v23, s6, 38 +; CHECK-NEXT: v_writelane_b32 v23, s7, 39 
+; CHECK-NEXT: v_writelane_b32 v23, s8, 40 +; CHECK-NEXT: v_writelane_b32 v23, s9, 41 +; CHECK-NEXT: v_writelane_b32 v23, s10, 42 +; CHECK-NEXT: v_writelane_b32 v23, s11, 43 +; CHECK-NEXT: v_writelane_b32 v23, s12, 44 +; CHECK-NEXT: v_writelane_b32 v23, s13, 45 +; CHECK-NEXT: v_writelane_b32 v23, s14, 46 +; CHECK-NEXT: v_writelane_b32 v23, s15, 47 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %ret ; CHECK-NEXT: s_endpgm @@ -202,51 +202,51 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 2 -; CHECK-NEXT: v_readlane_b32 s1, v23, 3 -; CHECK-NEXT: v_readlane_b32 s2, v23, 4 -; CHECK-NEXT: v_readlane_b32 s3, v23, 5 +; CHECK-NEXT: v_readlane_b32 s0, v22, 2 +; CHECK-NEXT: v_readlane_b32 s1, v22, 3 +; CHECK-NEXT: v_readlane_b32 s2, v22, 4 +; CHECK-NEXT: v_readlane_b32 s3, v22, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 6 -; CHECK-NEXT: v_readlane_b32 s1, v23, 7 -; CHECK-NEXT: v_readlane_b32 s2, v23, 8 -; CHECK-NEXT: v_readlane_b32 s3, v23, 9 -; CHECK-NEXT: v_readlane_b32 s4, v23, 10 -; CHECK-NEXT: v_readlane_b32 s5, v23, 11 -; CHECK-NEXT: v_readlane_b32 s6, v23, 12 -; CHECK-NEXT: v_readlane_b32 s7, v23, 13 +; CHECK-NEXT: v_readlane_b32 s0, v22, 6 +; CHECK-NEXT: v_readlane_b32 s1, v22, 7 +; CHECK-NEXT: v_readlane_b32 s2, v22, 8 +; CHECK-NEXT: v_readlane_b32 s3, v22, 9 +; CHECK-NEXT: v_readlane_b32 s4, v22, 10 +; CHECK-NEXT: v_readlane_b32 s5, v22, 11 +; CHECK-NEXT: v_readlane_b32 s6, v22, 12 +; CHECK-NEXT: v_readlane_b32 s7, v22, 13 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 14 -; CHECK-NEXT: v_readlane_b32 s1, v23, 15 -; CHECK-NEXT: v_readlane_b32 s2, v23, 16 -; CHECK-NEXT: v_readlane_b32 s3, v23, 17 -; CHECK-NEXT: v_readlane_b32 s4, v23, 18 -; CHECK-NEXT: v_readlane_b32 
s5, v23, 19 -; CHECK-NEXT: v_readlane_b32 s6, v23, 20 -; CHECK-NEXT: v_readlane_b32 s7, v23, 21 -; CHECK-NEXT: v_readlane_b32 s8, v23, 22 -; CHECK-NEXT: v_readlane_b32 s9, v23, 23 -; CHECK-NEXT: v_readlane_b32 s10, v23, 24 -; CHECK-NEXT: v_readlane_b32 s11, v23, 25 -; CHECK-NEXT: v_readlane_b32 s12, v23, 26 -; CHECK-NEXT: v_readlane_b32 s13, v23, 27 -; CHECK-NEXT: v_readlane_b32 s14, v23, 28 -; CHECK-NEXT: v_readlane_b32 s15, v23, 29 +; CHECK-NEXT: v_readlane_b32 s0, v22, 14 +; CHECK-NEXT: v_readlane_b32 s1, v22, 15 +; CHECK-NEXT: v_readlane_b32 s2, v22, 16 +; CHECK-NEXT: v_readlane_b32 s3, v22, 17 +; CHECK-NEXT: v_readlane_b32 s4, v22, 18 +; CHECK-NEXT: v_readlane_b32 s5, v22, 19 +; CHECK-NEXT: v_readlane_b32 s6, v22, 20 +; CHECK-NEXT: v_readlane_b32 s7, v22, 21 +; CHECK-NEXT: v_readlane_b32 s8, v22, 22 +; CHECK-NEXT: v_readlane_b32 s9, v22, 23 +; CHECK-NEXT: v_readlane_b32 s10, v22, 24 +; CHECK-NEXT: v_readlane_b32 s11, v22, 25 +; CHECK-NEXT: v_readlane_b32 s12, v22, 26 +; CHECK-NEXT: v_readlane_b32 s13, v22, 27 +; CHECK-NEXT: v_readlane_b32 s14, v22, 28 +; CHECK-NEXT: v_readlane_b32 s15, v22, 29 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 30 -; CHECK-NEXT: v_readlane_b32 s1, v23, 31 -; CHECK-NEXT: v_readlane_b32 s2, v23, 32 -; CHECK-NEXT: v_readlane_b32 s3, v23, 33 -; CHECK-NEXT: v_readlane_b32 s4, v23, 34 -; CHECK-NEXT: v_readlane_b32 s5, v23, 35 -; CHECK-NEXT: v_readlane_b32 s6, v23, 36 -; CHECK-NEXT: v_readlane_b32 s7, v23, 37 +; CHECK-NEXT: v_readlane_b32 s0, v22, 30 +; CHECK-NEXT: v_readlane_b32 s1, v22, 31 +; CHECK-NEXT: v_readlane_b32 s2, v22, 32 +; CHECK-NEXT: v_readlane_b32 s3, v22, 33 +; CHECK-NEXT: v_readlane_b32 s4, v22, 34 +; CHECK-NEXT: v_readlane_b32 s5, v22, 35 +; CHECK-NEXT: v_readlane_b32 s6, v22, 36 +; CHECK-NEXT: v_readlane_b32 s7, v22, 37 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[42:43] ; CHECK-NEXT: ;;#ASMEND @@ -256,10 +256,10 @@ define amdgpu_kernel void 
@kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 38 -; CHECK-NEXT: v_readlane_b32 s1, v23, 39 -; CHECK-NEXT: v_readlane_b32 s2, v23, 40 -; CHECK-NEXT: v_readlane_b32 s3, v23, 41 +; CHECK-NEXT: v_readlane_b32 s0, v22, 38 +; CHECK-NEXT: v_readlane_b32 s1, v22, 39 +; CHECK-NEXT: v_readlane_b32 s2, v22, 40 +; CHECK-NEXT: v_readlane_b32 s3, v22, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND @@ -272,100 +272,100 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[44:51] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s4, v23, 42 -; CHECK-NEXT: v_readlane_b32 s5, v23, 43 -; CHECK-NEXT: v_readlane_b32 s6, v23, 44 -; CHECK-NEXT: v_readlane_b32 s7, v23, 45 -; CHECK-NEXT: v_readlane_b32 s8, v23, 46 -; CHECK-NEXT: v_readlane_b32 s9, v23, 47 -; CHECK-NEXT: v_readlane_b32 s10, v23, 48 -; CHECK-NEXT: v_readlane_b32 s11, v23, 49 -; CHECK-NEXT: v_readlane_b32 s12, v23, 50 -; CHECK-NEXT: v_readlane_b32 s13, v23, 51 -; CHECK-NEXT: v_readlane_b32 s14, v23, 52 -; CHECK-NEXT: v_readlane_b32 s15, v23, 53 +; CHECK-NEXT: v_readlane_b32 s4, v22, 42 +; CHECK-NEXT: v_readlane_b32 s5, v22, 43 +; CHECK-NEXT: v_readlane_b32 s6, v22, 44 +; CHECK-NEXT: v_readlane_b32 s7, v22, 45 +; CHECK-NEXT: v_readlane_b32 s8, v22, 46 +; CHECK-NEXT: v_readlane_b32 s9, v22, 47 +; CHECK-NEXT: v_readlane_b32 s10, v22, 48 +; CHECK-NEXT: v_readlane_b32 s11, v22, 49 +; CHECK-NEXT: v_readlane_b32 s12, v22, 50 +; CHECK-NEXT: v_readlane_b32 s13, v22, 51 +; CHECK-NEXT: v_readlane_b32 s14, v22, 52 +; CHECK-NEXT: v_readlane_b32 s15, v22, 53 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 54 -; CHECK-NEXT: v_readlane_b32 s1, v23, 55 -; CHECK-NEXT: v_readlane_b32 s2, v23, 56 -; CHECK-NEXT: v_readlane_b32 s3, v23, 57 +; CHECK-NEXT: 
v_readlane_b32 s0, v22, 54 +; CHECK-NEXT: v_readlane_b32 s1, v22, 55 +; CHECK-NEXT: v_readlane_b32 s2, v22, 56 +; CHECK-NEXT: v_readlane_b32 s3, v22, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 58 -; CHECK-NEXT: v_readlane_b32 s1, v23, 59 -; CHECK-NEXT: v_readlane_b32 s2, v23, 60 -; CHECK-NEXT: v_readlane_b32 s3, v23, 61 -; CHECK-NEXT: v_readlane_b32 s4, v23, 62 -; CHECK-NEXT: v_readlane_b32 s5, v23, 63 -; CHECK-NEXT: v_readlane_b32 s6, v0, 0 -; CHECK-NEXT: v_readlane_b32 s7, v0, 1 +; CHECK-NEXT: v_readlane_b32 s0, v22, 58 +; CHECK-NEXT: v_readlane_b32 s1, v22, 59 +; CHECK-NEXT: v_readlane_b32 s2, v22, 60 +; CHECK-NEXT: v_readlane_b32 s3, v22, 61 +; CHECK-NEXT: v_readlane_b32 s4, v22, 62 +; CHECK-NEXT: v_readlane_b32 s5, v22, 63 +; CHECK-NEXT: v_readlane_b32 s6, v23, 0 +; CHECK-NEXT: v_readlane_b32 s7, v23, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 2 -; CHECK-NEXT: v_readlane_b32 s1, v0, 3 -; CHECK-NEXT: v_readlane_b32 s2, v0, 4 -; CHECK-NEXT: v_readlane_b32 s3, v0, 5 -; CHECK-NEXT: v_readlane_b32 s4, v0, 6 -; CHECK-NEXT: v_readlane_b32 s5, v0, 7 -; CHECK-NEXT: v_readlane_b32 s6, v0, 8 -; CHECK-NEXT: v_readlane_b32 s7, v0, 9 -; CHECK-NEXT: v_readlane_b32 s8, v0, 10 -; CHECK-NEXT: v_readlane_b32 s9, v0, 11 -; CHECK-NEXT: v_readlane_b32 s10, v0, 12 -; CHECK-NEXT: v_readlane_b32 s11, v0, 13 -; CHECK-NEXT: v_readlane_b32 s12, v0, 14 -; CHECK-NEXT: v_readlane_b32 s13, v0, 15 -; CHECK-NEXT: v_readlane_b32 s14, v0, 16 -; CHECK-NEXT: v_readlane_b32 s15, v0, 17 +; CHECK-NEXT: v_readlane_b32 s0, v23, 2 +; CHECK-NEXT: v_readlane_b32 s1, v23, 3 +; CHECK-NEXT: v_readlane_b32 s2, v23, 4 +; CHECK-NEXT: v_readlane_b32 s3, v23, 5 +; CHECK-NEXT: v_readlane_b32 s4, v23, 6 +; CHECK-NEXT: v_readlane_b32 s5, v23, 7 +; CHECK-NEXT: v_readlane_b32 s6, v23, 8 +; 
CHECK-NEXT: v_readlane_b32 s7, v23, 9 +; CHECK-NEXT: v_readlane_b32 s8, v23, 10 +; CHECK-NEXT: v_readlane_b32 s9, v23, 11 +; CHECK-NEXT: v_readlane_b32 s10, v23, 12 +; CHECK-NEXT: v_readlane_b32 s11, v23, 13 +; CHECK-NEXT: v_readlane_b32 s12, v23, 14 +; CHECK-NEXT: v_readlane_b32 s13, v23, 15 +; CHECK-NEXT: v_readlane_b32 s14, v23, 16 +; CHECK-NEXT: v_readlane_b32 s15, v23, 17 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 18 -; CHECK-NEXT: v_readlane_b32 s1, v0, 19 +; CHECK-NEXT: v_readlane_b32 s0, v23, 18 +; CHECK-NEXT: v_readlane_b32 s1, v23, 19 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 20 -; CHECK-NEXT: v_readlane_b32 s1, v0, 21 -; CHECK-NEXT: v_readlane_b32 s2, v0, 22 -; CHECK-NEXT: v_readlane_b32 s3, v0, 23 +; CHECK-NEXT: v_readlane_b32 s0, v23, 20 +; CHECK-NEXT: v_readlane_b32 s1, v23, 21 +; CHECK-NEXT: v_readlane_b32 s2, v23, 22 +; CHECK-NEXT: v_readlane_b32 s3, v23, 23 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 24 -; CHECK-NEXT: v_readlane_b32 s1, v0, 25 -; CHECK-NEXT: v_readlane_b32 s2, v0, 26 -; CHECK-NEXT: v_readlane_b32 s3, v0, 27 -; CHECK-NEXT: v_readlane_b32 s4, v0, 28 -; CHECK-NEXT: v_readlane_b32 s5, v0, 29 -; CHECK-NEXT: v_readlane_b32 s6, v0, 30 -; CHECK-NEXT: v_readlane_b32 s7, v0, 31 +; CHECK-NEXT: v_readlane_b32 s0, v23, 24 +; CHECK-NEXT: v_readlane_b32 s1, v23, 25 +; CHECK-NEXT: v_readlane_b32 s2, v23, 26 +; CHECK-NEXT: v_readlane_b32 s3, v23, 27 +; CHECK-NEXT: v_readlane_b32 s4, v23, 28 +; CHECK-NEXT: v_readlane_b32 s5, v23, 29 +; CHECK-NEXT: v_readlane_b32 s6, v23, 30 +; CHECK-NEXT: v_readlane_b32 s7, v23, 31 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 32 -; CHECK-NEXT: v_readlane_b32 s1, v0, 33 -; CHECK-NEXT: v_readlane_b32 s2, v0, 34 -; CHECK-NEXT: 
v_readlane_b32 s3, v0, 35 -; CHECK-NEXT: v_readlane_b32 s4, v0, 36 -; CHECK-NEXT: v_readlane_b32 s5, v0, 37 -; CHECK-NEXT: v_readlane_b32 s6, v0, 38 -; CHECK-NEXT: v_readlane_b32 s7, v0, 39 -; CHECK-NEXT: v_readlane_b32 s8, v0, 40 -; CHECK-NEXT: v_readlane_b32 s9, v0, 41 -; CHECK-NEXT: v_readlane_b32 s10, v0, 42 -; CHECK-NEXT: v_readlane_b32 s11, v0, 43 -; CHECK-NEXT: v_readlane_b32 s12, v0, 44 -; CHECK-NEXT: v_readlane_b32 s13, v0, 45 -; CHECK-NEXT: v_readlane_b32 s14, v0, 46 -; CHECK-NEXT: v_readlane_b32 s15, v0, 47 +; CHECK-NEXT: v_readlane_b32 s0, v23, 32 +; CHECK-NEXT: v_readlane_b32 s1, v23, 33 +; CHECK-NEXT: v_readlane_b32 s2, v23, 34 +; CHECK-NEXT: v_readlane_b32 s3, v23, 35 +; CHECK-NEXT: v_readlane_b32 s4, v23, 36 +; CHECK-NEXT: v_readlane_b32 s5, v23, 37 +; CHECK-NEXT: v_readlane_b32 s6, v23, 38 +; CHECK-NEXT: v_readlane_b32 s7, v23, 39 +; CHECK-NEXT: v_readlane_b32 s8, v23, 40 +; CHECK-NEXT: v_readlane_b32 s9, v23, 41 +; CHECK-NEXT: v_readlane_b32 s10, v23, 42 +; CHECK-NEXT: v_readlane_b32 s11, v23, 43 +; CHECK-NEXT: v_readlane_b32 s12, v23, 44 +; CHECK-NEXT: v_readlane_b32 s13, v23, 45 +; CHECK-NEXT: v_readlane_b32 s14, v23, 46 +; CHECK-NEXT: v_readlane_b32 s15, v23, 47 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index b4a981f1db4ec..882356d994fc6 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -12,12 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: 
INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %30.sub0 - ; GCN-NEXT: SI_SPILL_V64_SAVE %30, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %12.sub0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %22:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]] + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %12 ; GCN-NEXT: S_ENDPGM 0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0 From 0a739e9427fa342fe1333e1fb9c196b4d55afb8f Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Mon, 28 Oct 2024 15:39:17 -0700 Subject: [PATCH 11/13] This commit: (1) use checkForAllInstructions() to check AddrSpaceCast instructions instead of walking through all instructions (2) move constHasASCast() to AMDGPUInformationCache to make use the existing function getConstantAccess(). 
--- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 66 ++++++++++----------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 6f64c1cdb8094..81890d1094bf1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -262,6 +262,18 @@ class AMDGPUInformationCache : public InformationCache { return !HasAperture && (Access & ADDR_SPACE_CAST); } + bool constHasASCastFromPrivate(const Constant *C, Function &Fn) { + SmallPtrSet Visited; + uint8_t Access = getConstantAccess(C, Visited); + + if (Access & ADDR_SPACE_CAST) + if (const auto *CE = dyn_cast(C)) + if (CE->getOperand(0)->getType()->getPointerAddressSpace() == + AMDGPUAS::PRIVATE_ADDRESS) + return true; + return false; + } + private: /// Used to determine if the Constant needs the queue pointer. DenseMap ConstantStatus; @@ -440,21 +452,31 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { return; } - SmallPtrSet VisitedConsts; + { // FLAT_SCRATCH_INIT + auto AddrSpaceCastNotFromPrivate = [&](Instruction &I) { + return static_cast(I).getSrcAddressSpace() != + AMDGPUAS::PRIVATE_ADDRESS; + }; - for (Instruction &I : instructions(F)) { - if (isa(I) && - cast(I).getSrcAddressSpace() == - AMDGPUAS::PRIVATE_ADDRESS) { + bool UsedAssumedInformation = false; + if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this, + {Instruction::AddrSpaceCast}, + UsedAssumedInformation)) { + // If there is AddrSpaceCast instruction that casts from PRIVATE_ADDRESS removeAssumedBits(FLAT_SCRATCH_INIT); return; } - // check for addrSpaceCast in constant expressions - for (const Use &U : I.operands()) { - if (const auto *C = dyn_cast(U)) { - if (constHasASCast(C, VisitedConsts)) { - removeAssumedBits(FLAT_SCRATCH_INIT); - return; + + auto &InfoCache = static_cast(A.getInfoCache()); + + for (Instruction &I : instructions(F)) { + // check for addrSpaceCast from 
PRIVATE_ADDRESS in constant expressions + for (const Use &U : I.operands()) { + if (const auto *C = dyn_cast(U)) { + if (InfoCache.constHasASCastFromPrivate(C, *F)) { + removeAssumedBits(FLAT_SCRATCH_INIT); + return; + } } } } @@ -737,28 +759,6 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this, UsedAssumedInformation); } - - bool constHasASCast(const Constant *C, - SmallPtrSetImpl &Visited) { - if (!Visited.insert(C).second) - return false; - - if (const auto *CE = dyn_cast(C)) - if (CE->getOpcode() == Instruction::AddrSpaceCast && - CE->getOperand(0)->getType()->getPointerAddressSpace() == - AMDGPUAS::PRIVATE_ADDRESS) - return true; - - for (const Use &U : C->operands()) { - const auto *OpC = dyn_cast(U); - if (!OpC || !Visited.insert(OpC).second) - continue; - - if (constHasASCast(OpC, Visited)) - return true; - } - return false; - } }; AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP, From 944dfc3abbc39ce592c57a415aee326499a7cc16 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Tue, 29 Oct 2024 18:00:41 -0700 Subject: [PATCH 12/13] Move code from initialize() to needFlatScratch() which is called in updateImpl(). 
--- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 60 ++++++++++----------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 81890d1094bf1..0f8a3279ebbff 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -451,36 +451,6 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { indicatePessimisticFixpoint(); return; } - - { // FLAT_SCRATCH_INIT - auto AddrSpaceCastNotFromPrivate = [&](Instruction &I) { - return static_cast(I).getSrcAddressSpace() != - AMDGPUAS::PRIVATE_ADDRESS; - }; - - bool UsedAssumedInformation = false; - if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this, - {Instruction::AddrSpaceCast}, - UsedAssumedInformation)) { - // If there is AddrSpaceCast instruction that casts from PRIVATE_ADDRESS - removeAssumedBits(FLAT_SCRATCH_INIT); - return; - } - - auto &InfoCache = static_cast(A.getInfoCache()); - - for (Instruction &I : instructions(F)) { - // check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions - for (const Use &U : I.operands()) { - if (const auto *C = dyn_cast(U)) { - if (InfoCache.constHasASCastFromPrivate(C, *F)) { - removeAssumedBits(FLAT_SCRATCH_INIT); - return; - } - } - } - } - } } ChangeStatus updateImpl(Attributor &A) override { @@ -734,6 +704,34 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { bool needFlatScratchInit(Attributor &A) { assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set + // Check all AddrSpaceCast instructions. FlatScratchInit is needed if + // there is a cast from PRIVATE_ADDRESS. 
+ auto AddrSpaceCastNotFromPrivate = [&](Instruction &I) { + return static_cast(I).getSrcAddressSpace() != + AMDGPUAS::PRIVATE_ADDRESS; + }; + + bool UsedAssumedInformation = false; + if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this, + {Instruction::AddrSpaceCast}, + UsedAssumedInformation)) + return true; + + // Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions + auto &InfoCache = static_cast(A.getInfoCache()); + + Function *F = getAssociatedFunction(); + for (Instruction &I : instructions(F)) { + for (const Use &U : I.operands()) { + if (const auto *C = dyn_cast(U)) { + if (InfoCache.constHasASCastFromPrivate(C, *F)) + return true; + } + } + } + + // Finally check callees. + // This is called on each callee; false means callee shouldn't have // no-flat-scratch-init. auto CheckForNoFlatScratchInit = [&](Instruction &I) { @@ -752,7 +750,7 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { Intrinsic::amdgcn_addrspacecast_nonnull; }; - bool UsedAssumedInformation = false; + UsedAssumedInformation = false; // If any callee is false (i.e. need FlatScratchInit), // checkForAllCallLikeInstructions returns false, in which case this // function returns true. From 6296be119584be71580d2791a39a9ec1b0a9aa20 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 31 Oct 2024 17:06:38 -0700 Subject: [PATCH 13/13] Some minor code changes. 
--- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 0f8a3279ebbff..d8aa721b7ab25 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -262,7 +262,7 @@ class AMDGPUInformationCache : public InformationCache { return !HasAperture && (Access & ADDR_SPACE_CAST); } - bool constHasASCastFromPrivate(const Constant *C, Function &Fn) { + bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) { SmallPtrSet Visited; uint8_t Access = getConstantAccess(C, Visited); @@ -706,8 +706,8 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { // Check all AddrSpaceCast instructions. FlatScratchInit is needed if // there is a cast from PRIVATE_ADDRESS. - auto AddrSpaceCastNotFromPrivate = [&](Instruction &I) { - return static_cast(I).getSrcAddressSpace() != + auto AddrSpaceCastNotFromPrivate = [](Instruction &I) { + return cast(I).getSrcAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS; }; @@ -724,7 +724,7 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { for (Instruction &I : instructions(F)) { for (const Use &U : I.operands()) { if (const auto *C = dyn_cast(U)) { - if (InfoCache.constHasASCastFromPrivate(C, *F)) + if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C)) return true; } }