diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index f26639847be75..1d2c07b4deea9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1431,6 +1431,14 @@ def FeatureDisable : SubtargetFeature<"", "Dummy feature to disable assembler instructions" >; +// GFX-9 & higher targets have a 16-dword Instruction Buffer and per-SQ +// instruction store which can supply 4 dwords to each of the 2 waves per +// cycle. Prefer 8-dword (32-byte) alignment for loops on these targets. +def FeaturePrefLoopAlign32B : SubtargetFeature<"loop-align", + "PrefLoopAlignmentLog2", + "5", + "Prefer 32-byte alignment for loops">; + //===----------------------------------------------------------------------===// class GCNSubtargetFeatureGeneration ; @@ -1519,7 +1528,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad + FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, FeaturePrefLoopAlign32B ] >; @@ -1542,7 +1551,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeaturePrefLoopAlign32B ] >; @@ -1566,7 +1575,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureIEEEMinimumMaximumInsts, FeatureMinimum3Maximum3F32, - FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics + FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics, + 
FeaturePrefLoopAlign32B ] >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 64e68ab7d753c..5124aa04550d3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -628,6 +628,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setMaxAtomicSizeInBitsSupported(64); setMaxDivRemBitWidthSupported(64); setMaxLargeFPConvertBitWidthSupported(64); + setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopAlignment())); } bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 6878744496cfe..fff803b40c41e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -80,6 +80,7 @@ class AMDGPUSubtarget { unsigned LocalMemorySize = 0; unsigned AddressableLocalMemorySize = 0; char WavefrontSizeLog2 = 0; + unsigned PrefLoopAlignmentLog2 = 0; public: AMDGPUSubtarget(Triple TT); @@ -377,6 +378,8 @@ class AMDGPUSubtarget { uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; + unsigned getPrefLoopAlignment() const { return PrefLoopAlignmentLog2; } + /// \returns Corresponding DWARF register number mapping flavour for the /// \p WavefrontSize. 
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e866bd47e267d..eddb95e2a6a90 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -53,9 +53,13 @@ using namespace llvm::SDPatternMatch; STATISTIC(NumTailCalls, "Number of tail calls"); +static cl::opt + DisableAllLoopAlignment("amdgpu-disable-all-loop-alignment", + cl::desc("Do not align loops"), cl::init(false)); + static cl::opt DisableLoopAlignment("amdgpu-disable-loop-alignment", - cl::desc("Do not align and prefetch loops"), + cl::desc("Do not align loops for prefetch"), cl::init(false)); static cl::opt UseDivergentRegisterIndexing( @@ -17434,25 +17438,9 @@ Align SITargetLowering::computeKnownAlignForTargetInstr( Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML); const Align CacheLineAlign = Align(64); - - // Pre-GFX10 target did not benefit from loop alignment - if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() || - getSubtarget()->hasInstFwdPrefetchBug()) - return PrefAlign; - - // On GFX10 I$ is 4 x 64 bytes cache lines. - // By default prefetcher keeps one cache line behind and reads two ahead. - // We can modify it with S_INST_PREFETCH for larger loops to have two lines - // behind and one ahead. - // Therefor we can benefit from aligning loop headers if loop fits 192 bytes. - // If loop fits 64 bytes it always spans no more than two cache lines and - // does not need an alignment. - // Else if loop is less or equal 128 bytes we do not need to modify prefetch, - // Else if loop is less or equal 192 bytes we need two lines behind. 
- const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); const MachineBasicBlock *Header = ML->getHeader(); - if (Header->getAlignment() != PrefAlign) + if (DisableAllLoopAlignment || Header->getAlignment() > PrefAlign) return Header->getAlignment(); // Already processed. unsigned LoopSize = 0; @@ -17465,10 +17453,40 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { for (const MachineInstr &MI : *MBB) { LoopSize += TII->getInstSizeInBytes(MI); if (LoopSize > 192) - return PrefAlign; + break; } } + // Pre-GFX10 targets did not benefit from loop alignment driven by prefetch + // considerations + if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() || + getSubtarget()->hasInstFwdPrefetchBug()) { + // Align loops <= 32 bytes aggressively + if (LoopSize <= 32) + return PrefAlign; + // Align larger loops less aggressively + if (!ML->isInnermost()) + return Header->getAlignment(); + return (PrefAlign.value() > 1) ? Align(PrefAlign.value() >> 1) : PrefAlign; + } + + // On GFX10 I$ is 4 x 64 bytes cache lines. + // By default prefetcher keeps one cache line behind and reads two ahead. + // We can modify it with S_INST_PREFETCH for larger loops to have two lines + // behind and one ahead. + // Therefore we can benefit from aligning loop headers if loop fits 192 bytes. + // If loop fits 64 bytes it always spans no more than two cache lines and + // does not need an alignment. + // Else if loop is less or equal 128 bytes we do not need to modify prefetch, + // Else if loop is less or equal 192 bytes we need two lines behind. + + // Align larger loops less aggressively + if (LoopSize > 192) { + if (!ML->isInnermost()) + return Header->getAlignment(); + return (PrefAlign.value() > 1) ? 
Align(PrefAlign.value() >> 1) : PrefAlign; + } + if (LoopSize <= 64) return PrefAlign; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 666523c88860c..f5467807d2e98 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 
%s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 351502816ae6e..22a5353a6960d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc 
-amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index ff26ea21390e2..0c81ad1bccae9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s ; Divergent phis that don't require lowering using lane mask merging diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index a8a75cd2ffaa8..59ab1ccfc0250 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck 
-check-prefix=GFX10 %s ; This file contains various tests that have divergent i1s used outside of ; the loop. These are lane masks is sgpr and need to have correct value in diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll index fd08ab88990ed..d939dbe5a42f4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s ; Simples case, if - then, that requires lane mask merging, ; %phi lane mask will hold %val_A at %A. 
Lanes that are active in %B diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index d13d6a19d332a..29b69d299363c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s define void @temporal_divergent_i1_phi(float %val, ptr %addr) { ; GFX10-LABEL: temporal_divergent_i1_phi: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll index d4e5487828c48..061feb350cf19 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX10 %s define void @temporal_divergent_i32(float %val, ptr %addr) { ; GFX10-LABEL: temporal_divergent_i32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index 6148bc2d5ae6e..2bf8277dcbfac 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s ; Make sure the branch targets are correct after lowering llvm.amdgcn.if diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll index 8a53c862371cf..12ec7513ea925 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10-32 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX10-64 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10-32 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX10-64 %s define amdgpu_ps void @static_exact(float %arg0, float %arg1) { ; SI-LABEL: static_exact: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll index 
5240bf4f3a1d7..aa0dc465d8cc0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=OLD_RBS %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=NEW_RBS %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=OLD_RBS %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=NEW_RBS %s ; if instruction is uniform and there is available instruction, select SALU instruction define amdgpu_ps void @uniform_in_vgpr(float inreg %a, i32 inreg %b, ptr addrspace(1) %ptr) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index 9c2fabce4bcde..ff9eb55fc21bd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v3i8_liveout: diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 3160e38df5e3f..136c6b6fe1189 100644 --- 
a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; This testcase would fail on GFX908 due to not having a free VGPR available to ; copy between AGPRs. diff --git a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll index e03c9ca34b825..ff93f03405483 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10 declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.and.i32(i32, ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 394727c88b0be..3c16fe609b546 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn 
-amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 
-amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 4cc39d93854a0..f7c4a2b3bfd5a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1,30 +1,30 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | 
FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-disable-all-loop-alignment=true -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_ITERATIVE,GFX1164_ITERATIVE-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-FAKE16,GFX1164_ITERATIVE,GFX1164_ITERATIVE-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope 
-check-prefixes=GFX1132,GFX1132-TRUE16,GFX1132_ITERATIVE,GFX1132_ITERATIVE-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-FAKE16,GFX1132_ITERATIVE,GFX1132_ITERATIVE-FAKE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-FAKE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope 
-check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-FAKE16 %s ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-disable-all-loop-alignment=true -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck 
-enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_DPP,GFX1164_DPP-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-FAKE16,GFX1164_DPP,GFX1164_DPP-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-TRUE16,GFX1132_DPP,GFX1132_DPP-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-FAKE16,GFX1132_DPP,GFX1132_DPP-FAKE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_DPP,GFX1264_DPP-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_DPP,GFX1264_DPP-FAKE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck 
-enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_DPP,GFX1232_DPP-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_DPP,GFX1232_DPP-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_DPP,GFX1264_DPP-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_DPP,GFX1264_DPP-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_DPP,GFX1232_DPP-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_DPP,GFX1232_DPP-FAKE16 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -895,6 +895,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .p2align 5 ; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: 
s_ctz_i32_b64 s7, s[0:1] @@ -944,6 +945,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .p2align 5 ; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 @@ -2563,6 +2565,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align 5 ; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -2620,6 +2623,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align 5 ; GFX1132_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 @@ -4430,6 +4434,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: .p2align 5 ; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] @@ -4479,6 +4484,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s6, 
0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: .p2align 5 ; GFX1132_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 @@ -6129,6 +6135,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: .p2align 5 ; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] @@ -6186,6 +6193,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: .p2align 5 ; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 @@ -7946,6 +7954,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-TRUE16-NEXT: .p2align 5 ; GFX1164-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8011,6 +8020,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-FAKE16-NEXT: .p2align 5 ; GFX1164-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8075,6 +8085,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-TRUE16-NEXT: .p2align 5 ; GFX1132-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8137,6 +8148,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-FAKE16-NEXT: .p2align 5 ; GFX1132-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8659,6 +8671,7 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .p2align 5 ; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -8701,6 +8714,7 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: .p2align 5 ; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9639,6 +9653,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) 
%result, ptr addrspa ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-TRUE16-NEXT: .p2align 5 ; GFX1164-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9704,6 +9719,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-FAKE16-NEXT: .p2align 5 ; GFX1164-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9768,6 +9784,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-TRUE16-NEXT: .p2align 5 ; GFX1132-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9830,6 +9847,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-FAKE16-NEXT: .p2align 5 ; GFX1132-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10352,6 +10370,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, 
s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .p2align 5 ; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -10394,6 +10413,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: .p2align 5 ; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10732,6 +10752,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-TRUE16-NEXT: .p2align 5 ; GFX1164-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10779,6 +10800,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3 ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-FAKE16-NEXT: .p2align 5 ; GFX1164-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10826,6 +10848,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s6 ; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX1132-TRUE16-NEXT: .p2align 5 ; GFX1132-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10872,6 +10895,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s6 ; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX1132-FAKE16-NEXT: .p2align 5 ; GFX1132-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11987,6 +12011,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-NEXT: s_mov_b32 s4, s2 +; GFX1164-NEXT: .p2align 5 ; GFX1164-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -12022,6 +12047,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mov_b32_e32 v1, s4 ; GFX1132-NEXT: s_mov_b32 s4, s2 +; GFX1132-NEXT: .p2align 5 ; GFX1132-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 17737cccec7c4..6152564c774a8 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,18 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope 
-check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-disable-all-loop-alignment=true -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope 
-check-prefixes=GFX1032,GFX1032_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s ; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-disable-all-loop-alignment=true -mattr=-flat-for-global - 
-amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -11454,7 +11454,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164_ITERATIVE-NEXT: .p2align 6 ; GFX1164_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] @@ -11511,7 +11510,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132_ITERATIVE-NEXT: .p2align 6 ; GFX1132_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 @@ -13280,7 
+13278,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164_ITERATIVE-NEXT: .p2align 6 ; GFX1164_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] @@ -13337,7 +13334,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132_ITERATIVE-NEXT: .p2align 6 ; GFX1132_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 @@ -15096,7 +15092,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164_ITERATIVE-NEXT: .p2align 6 ; GFX1164_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] @@ -15153,7 +15148,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132_ITERATIVE-NEXT: .p2align 6 ; GFX1132_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 @@ -16910,7 +16904,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 -; 
GFX1164_ITERATIVE-NEXT: .p2align 6 ; GFX1164_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] @@ -16967,7 +16960,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132_ITERATIVE-NEXT: .p2align 6 ; GFX1132_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index e4def28667ed4..32e54f4c8a802 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s 
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck 
-enable-var-scope -check-prefixes=GFX12,GFX12W32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 39a3c9aade586..9fd97fe30ef3f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 
-amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W64 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12W32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index e4323999d19c3..4dc3436883740 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll 
@@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX1100 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX1100 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX908-LABEL: syncscope_system: diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll index 2cd50b3b1b2a2..6721a3e67b351 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s define i32 @atomic_nand_i32_lds(ptr addrspace(3) %ptr) nounwind { ; GCN-LABEL: atomic_nand_i32_lds: diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll index 
5fc9f4a0f8038..9e4df650fd47a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN:llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250 %s +; RUN:llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250 %s define float @global_system_atomic_fadd_f32(ptr addrspace(1) %ptr, float %val) { ; GFX1250-LABEL: global_system_atomic_fadd_f32: diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll index 722dff0e18a23..aaeaf3448a26d 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1030 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s ; For gfx1010, overestimate the branch size in case we need to insert ; a nop for the buggy offset. 
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll index 243f0ed3a8d0d..b5e85059330a7 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; FIXME: We should use llvm-mc for this, but we can't even parse our own output. ; See PR33579. -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-s-branch-bits=4 -amdgpu-long-branch-factor=0 -o %t.o -filetype=obj -simplifycfg-require-and-preserve-domtree=1 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-s-branch-bits=4 -amdgpu-long-branch-factor=0 -o %t.o -filetype=obj -simplifycfg-require-and-preserve-domtree=1 %s ; RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=OBJ %s ; OBJ: Relocations [ diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index 5959f76492f3c..213f645137c8e 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-s-branch-bits=5 
-simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-s-branch-bits=5 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -amdgpu-s-branch-bits=5 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s ; FIXME: We should use llvm-mc for this, but we can't even parse our own output. diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index f4b432dce8c8a..ea68a875ed43d 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc 
-mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s @@ -2494,7 +2494,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB10_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB10_4 Depth 2 @@ -4550,7 +4549,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; 
GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 @@ -4627,7 +4625,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 @@ -5268,7 +5265,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -5323,7 +5319,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -5786,7 +5781,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -5840,7 +5834,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; 
GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -6413,8 +6406,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB18_4 Depth 2 @@ -6468,7 +6459,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -6503,8 +6493,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: ; %bb.2: ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB18_4 Depth 2 @@ -6557,7 +6545,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 
0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -7569,7 +7556,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 @@ -9109,8 +9095,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -9149,7 +9133,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9163,8 +9146,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -9201,7 +9182,6 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9544,8 +9524,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -9582,7 +9560,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9595,8 +9572,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -9631,7 +9606,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: 
s_cbranch_execnz .LBB27_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10125,8 +10099,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB28_4 Depth 2 @@ -10181,7 +10153,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -10716,8 +10687,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -10756,7 +10725,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: 
s_or_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10770,8 +10738,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -10808,7 +10774,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11151,8 +11116,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -11189,7 +11152,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11202,8 +11164,6 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -11238,7 +11198,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11579,8 +11538,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -11619,7 +11576,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11633,8 +11589,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; 
GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -11671,7 +11625,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12014,8 +11967,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12052,7 +12003,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12065,8 +12015,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; 
GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -12101,7 +12049,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12440,8 +12387,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12478,7 +12423,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12491,8 +12435,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB33_1: ; 
%atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -12527,7 +12469,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 6f1675edbe58a..fdc11c0464c7d 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck 
-check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s @@ -1707,7 +1707,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 @@ -2640,7 +2639,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: 
.LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -2685,7 +2683,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -3098,7 +3095,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3142,7 +3138,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -3666,7 +3661,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 @@ -3745,7 +3739,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: ; %bb.2: ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: 
v_max_f16_e32 v10, v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 @@ -4396,7 +4389,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -4451,7 +4443,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -4916,7 +4907,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -4970,7 +4960,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -5545,8 +5534,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; 
GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 @@ -5600,7 +5587,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -5635,8 +5621,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: ; %bb.2: ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 @@ -5689,7 +5673,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -6932,7 +6915,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; 
GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 @@ -7559,8 +7541,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -7599,7 +7579,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7613,8 +7592,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -7651,7 +7628,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8084,8 +8060,6 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -8122,7 +8096,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8135,8 +8108,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -8171,7 +8142,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8815,8 +8785,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2 @@ -8871,7 +8839,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index acb27be1846b9..aab04b5dc578d 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: 
llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s @@ -1707,7 +1707,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 @@ -2640,7 +2639,6 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -2685,7 +2683,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -3098,7 +3095,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3142,7 +3138,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -3666,7 +3661,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: 
Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 @@ -3745,7 +3739,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: ; %bb.2: ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 @@ -4396,7 +4389,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -4451,7 +4443,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -4916,7 +4907,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -4970,7 +4960,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; 
GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -5545,8 +5534,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 @@ -5600,7 +5587,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -5635,8 +5621,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: ; %bb.2: ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 @@ -5689,7 +5673,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -6932,7 +6915,6 @@ 
define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 @@ -7559,8 +7541,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -7599,7 +7579,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7613,8 +7592,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -7651,7 +7628,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; 
GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8084,8 +8060,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -8122,7 +8096,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8135,8 +8108,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -8171,7 +8142,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8815,8 +8785,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2 @@ -8871,7 +8839,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 3c991cfb7a1aa..a81dbcc8f9280 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=SDAG-GFX942 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG-GFX1100 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL-GFX942 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL-GFX1100 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=SDAG-GFX942 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true 
-mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG-GFX1100 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx942 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL-GFX942 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL-GFX1100 %s ; Note: if you're adding tests here, also add them to ; lower-buffer-fat-pointers-mem-transfer.ll to verify the IR produced by diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll index 07816f1ed6a65..f03d8b5cf50b8 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll +++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=ISA -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -stop-before=si-fix-sgpr-copies < %s | FileCheck %s -check-prefix=MIR +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=ISA +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -stop-before=si-fix-sgpr-copies < %s | FileCheck %s -check-prefix=MIR define void @f(i32 %arg, ptr %ptr) { ; ISA-LABEL: f: diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll index c7f7f30a5e6bd..d92a632b7fcdf 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck %s -check-prefixes=GFX6789,GFX678,GFX689,GFX67,GFX6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck %s -check-prefixes=GFX6789,GFX678,GFX67,GFX7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck %s 
-check-prefixes=GFX6789,GFX678,GFX689,GFX89 -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX6789,GFX689,GFX89,GFX9 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12 +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX6789,GFX689,GFX89,GFX9 +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12 define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i32 inreg %val) { ; GFX67-LABEL: test_sink_smem_offset_400: diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll index 31c23b94a8de8..34c2272e7d237 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -mtriple=amdgcn-- -amdgpu-codegenprepare -S < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,DAGISEL-ASM -; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,GISEL-ASM +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,DAGISEL-ASM +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -global-isel -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,GISEL-ASM ; Tests that we can avoid nullptr checks for addrspacecasts from/to priv/local. 
; diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll index 2558da401f89a..b00ebd9e81f2a 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX1100 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX1100 %s ; Test that unused lanes in the s_xor result are masked out with v_cndmask. diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll index a13f3513c660e..fd9c3b35880e8 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true < %s -mtriple=amdgcn -mcpu=gfx90a | FileCheck %s define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: copy_to_reg_frameindex: diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index 9a98a7cd01ed4..42e88b86064ea 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None -o - %s | FileCheck %s +; RUN: 
llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None -o - %s | FileCheck %s %S = type <{ float, double }> diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 747affa928601..79bbcfbce2325 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G %s -; RUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G-O0 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G-O0 %s define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-LABEL: v_sdiv_i128_vv: diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll index 8c3d20ffb02fd..503ea8825aaef 100644 --- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -4,7 +4,7 
@@ ; checks are looking for the absence of specific metadata, which ; cannot be expressed reliably by the generated checks. -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=ISA +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=ISA ; RUN: opt --amdgpu-annotate-uniform -S %s | FileCheck %s -check-prefix=UNIFORM ; RUN: opt --amdgpu-annotate-uniform --si-annotate-control-flow -S %s | FileCheck %s -check-prefix=CONTROLFLOW diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 72913d2596ebf..77f1045090dfe 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck %s define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5) { ; CHECK-LABEL: cannot_create_empty_or_backwards_segment: diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll index fd64ea3ae1c4b..c5bf5f4b6ab86 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s define i32 @rocrand_regression(ptr addrspace(1) %arg, i32 %arg0, i1 %cmp7) { ; CHECK-LABEL: rocrand_regression: diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll index d03d53a8cbbaa..f43b43051caec 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s ; SGPR phi ends up with VGPR inputs. Make sure we do not try to ; process a copy which has already been erased (which was already diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 8c7d5cffe39d9..03d46bf266530 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc 
-mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s @@ -12133,7 +12133,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start ; 
GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12185,7 +12184,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12592,7 +12590,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12646,7 +12643,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13060,7 +13056,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) @@ -13114,7 +13109,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13522,7 +13516,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13574,7 +13567,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13975,7 +13967,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14027,7 +14018,6 @@ define void 
@flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14389,7 +14379,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14431,7 +14420,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14757,7 +14745,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 @@ -14797,7 +14784,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; 
GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15154,7 +15140,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15204,7 +15189,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15605,7 +15589,6 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15659,7 +15642,6 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16071,7 +16053,6 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16123,7 +16104,6 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18669,8 +18649,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18707,7 +18685,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB68_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ 
-18719,8 +18696,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18756,7 +18731,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB68_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -18995,8 +18969,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19033,7 +19005,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB69_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -19045,8 +19016,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19082,7 +19051,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB69_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -19333,8 +19301,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19370,7 +19336,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19387,8 +19352,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19424,7 +19387,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19671,8 +19633,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19708,7 +19668,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB71_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19719,8 +19678,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19755,7 +19712,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB71_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19987,8 +19943,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20024,7 +19978,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -20035,8 +19988,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20071,7 +20022,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB72_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -20318,8 +20268,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20355,7 +20303,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB73_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -20371,8 +20318,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20407,7 +20352,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB73_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 
exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -20655,8 +20599,6 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20693,7 +20635,6 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -20705,8 +20646,6 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20742,7 +20681,6 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB74_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -20987,8 +20925,6 @@ define void 
@flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21024,7 +20960,6 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB75_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -21035,8 +20970,6 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21071,7 +21004,6 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB75_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -21312,8 +21244,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 
s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21350,7 +21280,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -21362,8 +21291,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21399,7 +21326,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB76_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -21638,8 +21564,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: 
.LBB77_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21675,7 +21599,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB77_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -21686,8 +21609,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21722,7 +21643,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB77_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -21954,8 +21874,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21992,7 +21910,6 @@ define <2 x bfloat> 
@flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB78_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -22004,8 +21921,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -22041,7 +21956,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB78_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -22280,8 +22194,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -22317,7 +22229,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; 
GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB79_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -22328,8 +22239,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -22364,7 +22273,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB79_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 56ad91dd59ffb..688b59d6a1f6a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; 
RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s @@ -9995,7 +9995,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10047,7 +10046,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10455,7 +10453,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10509,7 +10506,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10924,7 +10920,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: 
.LBB38_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10978,7 +10973,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11380,7 +11374,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11430,7 +11423,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11825,7 +11817,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-11877,7 +11868,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12279,7 +12269,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12331,7 +12320,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12694,7 +12682,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12736,7 +12723,6 @@ define bfloat 
@flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13063,7 +13049,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 @@ -13103,7 +13088,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13475,7 +13459,6 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13529,7 +13512,6 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13942,7 +13924,6 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13994,7 +13975,6 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16317,8 +16297,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16355,7 +16333,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -16367,8 +16344,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16404,7 +16379,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -16768,8 +16742,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16806,7 +16778,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -16818,8 +16789,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16855,7 +16824,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -17235,8 +17203,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17272,7 +17238,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17289,8 +17254,6 @@ define <2 x bfloat> 
@flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17326,7 +17289,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17694,8 +17656,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17731,7 +17691,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17742,8 +17701,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; 
GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17778,7 +17735,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18131,8 +18087,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18168,7 +18122,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18179,8 +18132,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18215,7 +18166,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18586,8 +18536,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18623,7 +18571,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18639,8 +18586,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18675,7 +18620,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19049,8 +18993,6 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19087,7 +19029,6 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -19099,8 +19040,6 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19136,7 +19075,6 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: 
s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -19503,8 +19441,6 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19540,7 +19476,6 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19551,8 +19486,6 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19587,7 +19520,6 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; diff --git 
a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index f0083bd23660a..fc22844987838 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true 
-mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s @@ -9995,7 +9995,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10047,7 +10046,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10455,7 +10453,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10509,7 +10506,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10924,7 +10920,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10978,7 +10973,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11380,7 +11374,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11430,7 +11423,6 @@ define void 
@flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11825,7 +11817,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11877,7 +11868,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12279,7 +12269,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12331,7 +12320,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12694,7 +12682,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12736,7 +12723,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13063,7 +13049,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 @@ -13103,7 +13088,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB43_1: ; 
%atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13475,7 +13459,6 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13529,7 +13512,6 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13942,7 +13924,6 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13994,7 +13975,6 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16317,8 +16297,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16355,7 +16333,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -16367,8 +16344,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16404,7 +16379,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -16768,8 +16742,6 @@ define <2 x bfloat> 
@flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16806,7 +16778,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -16818,8 +16789,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16855,7 +16824,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -17235,8 +17203,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, 
vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17272,7 +17238,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17289,8 +17254,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17326,7 +17289,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17694,8 +17656,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; 
GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17731,7 +17691,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17742,8 +17701,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17778,7 +17735,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18131,8 +18087,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18168,7 
+18122,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18179,8 +18132,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18215,7 +18166,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18586,8 +18536,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18623,7 +18571,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 ; 
GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18639,8 +18586,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18675,7 +18620,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19049,8 +18993,6 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19087,7 +19029,6 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -19099,8 +19040,6 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19136,7 +19075,6 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -19503,8 +19441,6 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19540,7 +19476,6 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19551,8 +19486,6 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19587,7 +19520,6 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 3ee0bb2122abe..1ecf911155caf 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal 
-mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s @@ -9578,7 +9578,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9630,7 +9629,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10037,7 +10035,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10091,7 +10088,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10505,7 +10501,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10559,7 +10554,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10960,7 +10954,6 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11010,7 +11003,6 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11404,7 +11396,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11456,7 +11447,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB36_1: ; 
%atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11857,7 +11847,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11909,7 +11898,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12271,7 +12259,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12313,7 +12300,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12639,7 
+12625,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 @@ -12679,7 +12664,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13050,7 +13034,6 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13104,7 +13087,6 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13516,7 +13498,6 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13568,7 +13549,6 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15746,8 +15726,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15784,7 +15762,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -15796,8 +15773,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; 
GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15833,7 +15808,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -16197,8 +16171,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16235,7 +16207,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -16247,8 +16218,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB51_1: ; 
%atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16284,7 +16253,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB51_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -16664,8 +16632,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16701,7 +16667,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16718,8 +16683,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) @@ -16755,7 +16718,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17123,8 +17085,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17160,7 +17120,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17171,8 +17130,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17207,7 +17164,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: 
s_cbranch_execnz .LBB53_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17560,8 +17516,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17597,7 +17551,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17608,8 +17561,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17644,7 +17595,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: 
s_setpc_b64 s[30:31] ; @@ -18015,8 +17965,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18052,7 +18000,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18068,8 +18015,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18104,7 +18049,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18478,8 +18422,6 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18516,7 +18458,6 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -18528,8 +18469,6 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18565,7 +18504,6 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -18932,8 +18870,6 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; 
GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18969,7 +18905,6 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18980,8 +18915,6 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19016,7 +18949,6 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index 20795431b4cd8..5e25231dea07e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel 
-mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s ; Test using saddr addressing mode of flat_*load_* instructions. diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 1311560715ddd..ef34e45dcfa90 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN1 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN2 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s ; --------------------------------------------------------------------- ; atomicrmw xchg diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 23dfe2f70fa7e..0c59fd35516ba 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN1 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN2 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 
< %s | FileCheck -check-prefixes=GCN3 %s ; --------------------------------------------------------------------- ; atomicrmw xchg diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index fe47461ebf956..68ba18b20433b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s ; --------------------------------------------------------------------- ; atomicrmw xchg diff --git a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll index 6ef89a4ccd485..074c2208be73a 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -disable-machine-sink=1 - < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1030 -disable-machine-sink=1 - < %s | FileCheck -check-prefix=GFX10 %s define float @fold_abs_in_branch(float %arg1, float %arg2) { ; GFX10-LABEL: fold_abs_in_branch: diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 0cb2b0b7df3d2..3cb623c1ba898 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX90A -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX942 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX1250 +; RUN: llc -amdgpu-disable-all-loop-alignment=true < %s -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX90A +; RUN: llc -amdgpu-disable-all-loop-alignment=true < %s -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX942 +; RUN: llc -amdgpu-disable-all-loop-alignment=true < %s -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX1250 declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 1f74fbdc46e98..9d8c0d559a19a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc 
-mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s @@ -12479,7 +12479,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12531,7 +12530,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -12988,7 +12986,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -13042,7 +13039,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -13508,7 +13504,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; 
GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -13562,7 +13557,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -14015,7 +14009,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -14065,7 +14058,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -14508,7 +14500,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ 
-14560,7 +14551,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -15011,7 +15001,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -15063,7 +15052,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -15475,7 +15463,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -15517,7 +15504,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-FAKE16-NEXT: 
global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -15882,7 +15868,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 @@ -15922,7 +15907,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -16330,7 +16314,6 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -16384,7 +16367,6 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; 
GFX11-FAKE16-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -16848,7 +16830,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -16900,7 +16881,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -20415,8 +20395,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -20453,7 +20431,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB78_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: 
s_setpc_b64 s[30:31] @@ -20465,8 +20442,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -20502,7 +20477,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB78_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -20794,8 +20768,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -20832,7 +20804,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB79_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -20844,8 +20815,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -20881,7 +20850,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB79_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -21175,8 +21143,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -21213,7 +21179,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB80_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -21225,8 +21190,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; 
GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -21262,7 +21225,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB80_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -21560,8 +21522,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -21597,7 +21557,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB81_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -21608,8 +21567,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -21644,7 +21601,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB81_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -21927,8 +21883,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -21964,7 +21918,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB82_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -21975,8 +21928,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -22011,7 +21962,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; 
GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB82_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -22297,8 +22247,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -22334,7 +22282,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB83_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -22345,8 +22292,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -22381,7 +22326,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB83_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: 
s_setpc_b64 s[30:31] ; @@ -22676,8 +22620,6 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -22714,7 +22656,6 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB84_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -22726,8 +22667,6 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -22763,7 +22702,6 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB84_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -23060,8 +22998,6 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -23097,7 +23033,6 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB85_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -23108,8 +23043,6 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -23144,7 +23077,6 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB85_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -23432,8 +23364,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 
6 ; GFX11-TRUE16-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -23470,7 +23400,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -23482,8 +23411,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -23519,7 +23446,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -23811,8 +23737,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) @@ -23848,7 +23772,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB87_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -23859,8 +23782,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -23895,7 +23816,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB87_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -24178,8 +24098,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -24216,7 +24134,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB88_1 ; 
GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -24228,8 +24145,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -24265,7 +24180,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB88_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -24557,8 +24471,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -24594,7 +24506,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB89_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -24605,8 +24516,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -24641,7 +24550,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB89_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -24924,8 +24832,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -24962,7 +24868,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB90_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -24974,8 +24879,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -25011,7 +24914,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB90_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -25303,8 +25205,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -25340,7 +25240,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB91_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -25351,8 +25250,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; 
GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -25387,7 +25284,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB91_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index faa74fef2be2f..7af43300fb852 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck 
-check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s @@ -8900,7 +8900,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -8952,7 +8951,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -9411,7 +9409,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -9465,7 +9462,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -9933,7 +9929,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -9987,7 +9982,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB38_1: ; 
%atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -10442,7 +10436,6 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -10492,7 +10485,6 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -10937,7 +10929,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -10989,7 +10980,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -11442,7 +11432,6 @@ define void 
@global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -11494,7 +11483,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -11908,7 +11896,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -11950,7 +11937,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -12317,7 +12303,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 
v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 @@ -12357,7 +12342,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -12767,7 +12751,6 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12821,7 +12804,6 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -13287,7 +13269,6 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB45_1: ; 
%atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -13339,7 +13320,6 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -16121,8 +16101,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -16159,7 +16137,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -16171,8 +16148,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ 
-16208,7 +16183,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -16625,8 +16599,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -16663,7 +16635,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -16675,8 +16646,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -16712,7 +16681,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-FAKE16-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -17131,8 +17099,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -17169,7 +17135,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -17181,8 +17146,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -17218,7 +17181,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: 
s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -17637,8 +17599,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -17674,7 +17634,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17685,8 +17644,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -17721,7 +17678,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18125,8 +18081,6 @@ define void 
@global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -18162,7 +18116,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18173,8 +18126,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -18209,7 +18160,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18616,8 +18566,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; 
GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -18653,7 +18601,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18664,8 +18611,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -18700,7 +18645,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19121,8 +19065,6 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) @@ -19159,7 +19101,6 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -19171,8 +19112,6 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -19208,7 +19147,6 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -19627,8 +19565,6 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -19664,7 +19600,6 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-TRUE16-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19675,8 +19610,6 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -19711,7 +19644,6 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index a46b0129b79e6..f19559a168178 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck 
-check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s @@ -8900,7 +8900,6 @@ define bfloat 
@global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -8952,7 +8951,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -9411,7 +9409,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -9465,7 +9462,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -9933,7 +9929,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -9987,7 +9982,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -10442,7 +10436,6 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -10492,7 +10485,6 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -10937,7 +10929,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -10989,7 +10980,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -11442,7 +11432,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -11494,7 +11483,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -11908,7 +11896,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -11950,7 +11937,6 @@ define bfloat 
@global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -12317,7 +12303,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 @@ -12357,7 +12342,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -12767,7 +12751,6 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12821,7 +12804,6 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, 
v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -13287,7 +13269,6 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -13339,7 +13320,6 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -16121,8 +16101,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -16159,7 +16137,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -16171,8 +16148,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -16208,7 +16183,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -16625,8 +16599,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -16663,7 +16635,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -16675,8 +16646,6 @@ define 
<2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -16712,7 +16681,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -17131,8 +17099,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -17169,7 +17135,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -17181,8 +17146,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -17218,7 +17181,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -17637,8 +17599,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -17674,7 +17634,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17685,8 +17644,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: 
.LBB57_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -17721,7 +17678,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18125,8 +18081,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -18162,7 +18116,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18173,8 +18126,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -18209,7 +18160,6 @@ define void 
@global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18616,8 +18566,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -18653,7 +18601,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18664,8 +18611,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -18700,7 +18645,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19121,8 +19065,6 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -19159,7 +19101,6 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -19171,8 +19112,6 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -19208,7 +19147,6 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 
s[30:31] @@ -19627,8 +19565,6 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -19664,7 +19600,6 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19675,8 +19610,6 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -19711,7 +19644,6 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 053efdcb76261..2e9e1d84a6fc4 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: 
llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s @@ -9425,7 +9425,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -9477,7 +9476,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -9934,7 +9932,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -9988,7 +9985,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -10454,7 +10450,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -10508,7 +10503,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -10961,7 +10955,6 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -11011,7 +11004,6 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: .p2align 
6 ; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -11454,7 +11446,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -11506,7 +11497,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -11957,7 +11947,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12009,7 +11998,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -12421,7 +12409,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12463,7 +12450,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -12828,7 +12814,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 @@ -12868,7 +12853,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -13276,7 +13260,6 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -13330,7 +13313,6 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -13794,7 +13776,6 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -13846,7 +13827,6 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -16482,8 +16462,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: 
s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -16520,7 +16498,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -16532,8 +16509,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -16569,7 +16544,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -16986,8 +16960,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -17024,7 +16996,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -17036,8 +17007,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -17073,7 +17042,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB51_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -17492,8 +17460,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -17530,7 +17496,6 @@ define <2 x bfloat> 
@global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -17542,8 +17507,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -17579,7 +17542,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -17998,8 +17960,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -18035,7 +17995,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: 
s_cbranch_execnz .LBB53_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18046,8 +18005,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -18082,7 +18039,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18486,8 +18442,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -18523,7 +18477,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ 
-18534,8 +18487,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -18570,7 +18521,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18977,8 +18927,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -19014,7 +18962,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19025,8 +18972,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; 
GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -19061,7 +19006,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19482,8 +19426,6 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -19520,7 +19462,6 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -19532,8 +19473,6 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; 
GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -19569,7 +19508,6 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -19988,8 +19926,6 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -20025,7 +19961,6 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -20036,8 +19971,6 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -20072,7 +20005,6 @@ define void 
@global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll index 6fe9e1d5561de..99cf5a6d3c134 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s ; The first load produces address in a VGPR which is used in address calculation ; of the second load (one inside the loop). 
The value is uniform and the inner diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll index b7ee9f70f6014..33c8f1488c0ff 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12 %s ; Test using saddr addressing mode of global_* flat atomic instructions. 
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 1602e31d6147c..0d93c3d65df6f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 
-mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s ; Test using saddr addressing mode of global_*load_* flat instructions. 
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index a867c6c1affb8..9fdf540d26465 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; --------------------------------------------------------------------- ; atomicrmw xchg diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index a7f16449f058e..f4e8791a88f55 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 
; --------------------------------------------------------------------- ; atomicrmw xchg diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 37756d15861be..c682d38fcb675 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck 
-enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope 
-check-prefixes=GFX1032-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare float @div.float.value() declare double @div.double.value() diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 6351bb39e97f5..9e89268478a26 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32, -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | 
FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32, -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck 
-enable-var-scope -check-prefixes=GFX9-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare float @div.float.value() declare float @div.double.value() diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index a9ac00863cd17..46d6588a928c5 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 
-mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 
-mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare float @div.float.value() declare float @div.double.value() diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 6311143f57260..aa290caebdce4 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck 
-enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck 
-enable-var-scope -check-prefixes=GFX9-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare float @div.float.value() declare double @div.double.value() diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 56ceba258f471..100f373cec2f8 100644 --- 
a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s define void @main(i1 %arg) #0 { ; CHECK-LABEL: main: diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 835818fb2fd15..229a501295ee4 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -22,6 +22,7 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s8, s5, s4 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB0_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_not_b32 s10, s5 @@ -69,6 +70,7 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-NEXT: s_add_i32 s8, s4, s5 ; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB0_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -185,6 +187,7 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s8, s5, s4 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB1_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_not_b32 s10, s5 @@ -230,6 +233,7 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-NEXT: s_add_i32 s8, s4, s5 ; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB1_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX10-NEXT: s_not_b32 s9, s5 @@ -343,6 +347,7 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 ; GFX9-NEXT: s_add_i32 s5, s6, s5 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB2_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_mul_hi_u32 s6, s3, s5 @@ -387,6 +392,7 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_mul_hi_u32 s6, s5, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_add_i32 s5, s5, s6 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB2_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_mul_hi_u32 s6, s4, s5 @@ -498,6 +504,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s4, s5, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB3_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4 @@ -537,6 +544,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_mul_hi_u32 s5, s4, s3 ; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: s_add_i32 s4, s4, s5 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB3_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_mul_hi_u32 s5, s3, s4 @@ -632,6 +640,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s2 @@ -663,6 +672,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: 
v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s2 @@ -745,6 +755,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s3 @@ -778,6 +789,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s3 @@ -865,6 +877,7 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_sext_i32_i16 s4, s3 @@ -902,6 +915,7 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB6_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_sext_i32_i16 s4, s3 @@ -996,6 +1010,7 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: .p2align 4 ; GFX9-NEXT: .LBB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_sext_i32_i16 s6, s3 
@@ -1035,6 +1050,7 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: .p2align 4 ; GFX10-NEXT: .LBB7_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_sext_i32_i16 s4, s3 diff --git a/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll index f582f984a3924..43851291f800b 100644 --- a/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll +++ b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s ; iglp.opt should not be flagged as clobbering the memory operand for the global_load, and we should be able to ; lower into the scalar version (i.e. 
should not need to lower into vector version with waterfall loop) diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll index 0a493e5188ad5..61c99e149142b 100644 --- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-LABEL: InferNothing: diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index eb5c5ef15ed56..3a76691560ab6 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -172,6 +172,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_cbranch_vccz .LBB2_12 ; GFX11-NEXT: ; %bb.9: ; GFX11-NEXT: s_xor_b32 s0, s8, -1 +; GFX11-NEXT: .p2align 5 ; GFX11-NEXT: .LBB2_10: ; %bb17 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll index fb075221706dd..f08fa6ff2b634 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -stop-after=si-insert-waitcnts < %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -stop-after=si-insert-waitcnts < %s | FileCheck %s declare fastcc void 
@bar() diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index a3b0a7768ca67..e7293332d7dce 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9 -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10 -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-flat-scratch,+precise-memory < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX12 +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-flat-scratch,+precise-memory < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11 +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s 
-check-prefixes=GFX12 ; from atomicrmw-expand.ll ; covers flat_load, flat_atomic (atomic with return) diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll index 300124848c1aa..92a25cb580e20 100644 --- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s define amdgpu_gfx [13 x i32] @issue130120() { ; CHECK-LABEL: issue130120: diff --git a/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll b/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll index f18a657b8082d..80b4365fb42d3 100644 --- a/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll +++ b/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s ; Check for correct folding of the constants produced by the ; stepvector into the fadd. 
The value should not get lost when folding diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll index 1ab4cb0f00192..443efd72b9b66 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GISEL12 %s -; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=DAGISEL12 %s -; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefix=GISEL10 %s -; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefix=DAGISEL10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GISEL12 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=DAGISEL12 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefix=GISEL10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefix=DAGISEL10 %s define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) { ; GISEL12-LABEL: basic: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll index 0a2e7afa3d417..ddbeab62a000f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: 
llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11-12,GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11-12,GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11-12,GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11-12,GFX12 %s define amdgpu_gs void @gs_const() { ; SI-LABEL: gs_const: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll index dbe95a8091932..43df6fe422888 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,SDAG,GFX9-SDAG -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,SDAG,GFX10-SDAG -; RUN: llc -global-isel=1 
-mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,SDAG,GFX9-SDAG +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,SDAG,GFX10-SDAG +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL declare void @foo(i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll index a2c1545743039..c2deecc382b87 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL -; 
RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 +; RUN: llc < %s 
-mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) { ; GFX11-LABEL: raw_atomic_buffer_load_i32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll index 6f7c001e03e26..a71ef5c0a2c92 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=0 | FileCheck %s 
-check-prefixes=GFX12,GFX12-FAKE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 
-amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %ptr) { ; GFX11-LABEL: raw_ptr_atomic_buffer_ptr_load_i32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll index 88963643218a5..4c2312d4cf0f4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 -; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL -; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16 -; RUN: llc -global-isel=1 
-mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %index) { ; GFX11-LABEL: 
struct_atomic_buffer_load_i32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll index 23db2479f66bb..d0a27502ef12e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 -; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL -; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < 
%s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr, i32 %index) { ; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 7d3b316915923..489d1e43cd6b0 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GFX10-32 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-64 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GFX10-32 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-64 %s define amdgpu_ps void @static_exact(float %arg0, float %arg1) { ; SI-LABEL: static_exact: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index a42c71c4849bd..2348e269a21e7 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal 
-mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s @@ -3525,7 +3525,6 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -3572,7 +3571,6 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -3986,7 +3984,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4035,7 +4032,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4446,7 +4442,6 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4492,7 
+4487,6 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4892,7 +4886,6 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4939,7 +4932,6 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -5313,7 +5305,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -5351,7 +5342,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 
6 ; GFX11-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -5684,7 +5674,6 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -5720,7 +5709,6 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -6842,8 +6830,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -6879,7 +6865,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -6891,8 +6876,6 @@ define <2 x bfloat> 
@local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -6926,7 +6909,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -7199,8 +7181,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7236,7 +7216,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -7248,8 +7227,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 
0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7283,7 +7260,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -7557,8 +7533,6 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7592,7 +7566,6 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7603,8 +7576,6 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; 
GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7637,7 +7608,6 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7901,8 +7871,6 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7936,7 +7904,6 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7947,8 +7914,6 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7981,7 +7946,6 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 8351d28057564..bcc2ff2e60331 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck 
-check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s @@ -3131,7 +3131,6 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -3178,7 +3177,6 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -3594,7 +3592,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -3643,7 +3640,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4056,7 +4052,6 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4102,7 +4097,6 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4504,7 +4498,6 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: 
s_waitcnt lgkmcnt(0) @@ -4551,7 +4544,6 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4927,7 +4919,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4965,7 +4956,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -5300,7 +5290,6 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -5336,7 +5325,6 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; 
GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -6775,8 +6763,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -6812,7 +6798,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -6824,8 +6809,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -6859,7 +6842,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -7257,8 +7239,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7294,7 +7274,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -7306,8 +7285,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7341,7 +7318,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -7734,8 +7710,6 @@ define void 
@local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7769,7 +7743,6 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7780,8 +7753,6 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7814,7 +7785,6 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8197,8 +8167,6 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: 
s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -8232,7 +8200,6 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8243,8 +8210,6 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -8277,7 +8242,6 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 0c4aca88b3781..9ed0b14f722ab 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck 
-check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; 
RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s @@ -3131,7 +3131,6 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -3178,7 +3177,6 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -3594,7 +3592,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -3643,7 +3640,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4056,7 +4052,6 @@ 
define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4102,7 +4097,6 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4504,7 +4498,6 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4551,7 +4544,6 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4927,7 +4919,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: 
.LBB18_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4965,7 +4956,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -5300,7 +5290,6 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -5336,7 +5325,6 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -6775,8 +6763,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -6812,7 +6798,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr 
addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -6824,8 +6809,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -6859,7 +6842,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -7257,8 +7239,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7294,7 +7274,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 ; 
GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -7306,8 +7285,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7341,7 +7318,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -7734,8 +7710,6 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7769,7 +7743,6 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7780,8 +7753,6 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7814,7 +7785,6 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8197,8 +8167,6 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -8232,7 +8200,6 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8243,8 +8210,6 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 
v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -8277,7 +8242,6 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 37310b614c0db..7c125fe112238 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc 
-mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-disable-all-loop-alignment=true -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 -amdgpu-disable-all-loop-alignment=true < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s @@ -3987,7 +3987,6 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4034,7 +4033,6 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4448,7 +4446,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4497,7 +4494,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4908,7 +4904,6 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -4954,7 +4949,6 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -5354,7 +5348,6 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -5401,7 +5394,6 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -5775,7 +5767,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -5813,7 +5804,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -6146,7 +6136,6 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -6182,7 +6171,6 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7547,8 +7535,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7584,7 +7570,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -7596,8 +7581,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7631,7 +7614,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -8029,8 +8011,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -8066,7 +8046,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -8078,8 +8057,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -8113,7 +8090,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr 
addrspace(3) %ptr, ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -8506,8 +8482,6 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -8541,7 +8515,6 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8552,8 +8525,6 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -8586,7 +8557,6 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: 
s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8969,8 +8939,6 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -9004,7 +8972,6 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9015,8 +8982,6 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -9049,7 +9014,6 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll 
b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 5f0ca7bc42ae0..a33b46ac5a2a0 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck --check-prefix=MUBUF %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=FLATSCR %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck --check-prefix=MUBUF %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=FLATSCR %s ; Make sure we use the correct frame offset is used with the local ; frame area. diff --git a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll index 0ce3742bb0e83..d971c46b484d0 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -amdgpu-enable-rewrite-partial-reg-uses=false < %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -amdgpu-enable-rewrite-partial-reg-uses=false < %s | FileCheck %s ; This example used to produce a verifier error resulting from the ; register coalescer leaving behind a false live interval when a live diff --git a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll index 2445925b89bef..05f8c7abb3b4c 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll +++ 
b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s define void @loop_on_argument(i1 %arg) { ; IR-LABEL: @loop_on_argument( diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 702a69f776de3..3786975cee4c5 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefix=GFX12-SPREFETCH %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX1250 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefix=GFX12-SPREFETCH %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX1250 %s define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) { ; GFX12-LABEL: copy_flat: diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll index 595a78ca0c08c..308b1c96d7585 100644 --- 
a/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll @@ -1,15 +1,20 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -asm-verbose=0 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10-ASM %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s -filetype=obj | llvm-objdump -d --arch-name=amdgcn --mcpu=gfx1030 --symbolize-operands - | FileCheck --check-prefixes=GCN,GFX10,GFX10-DIS %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s + +; GFX9-LABEL: test_loop_64 +; GFX9: .p2align 5 ; GFX8-NOT: s_inst_prefetch ; GFX8-NOT: .palign 6 ; GCN-LABEL: test_loop_64 ; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 -; GFX10-DIS-NEXT: {{^$}} +; GFX10-ASM-NEXT: p2align 5 +; GFX10-DIS-NEXT: s_nop 0 ; GFX10-ASM-NEXT: [[L1:.LBB[0-9_]+]]: -; GFX10-DIS-NEXT: <[[L1:L[0-9]+]]>: +; GFX10-DIS: <[[L1:L[0-9]+]]>: ; GFX10: s_sleep 0 ; GFX10: s_cbranch_scc0 [[L1]] ; GFX10-NEXT: s_endpgm @@ -28,6 +33,9 @@ bb2: ; preds = %bb2, %bb br i1 %tmp3, label %bb1, label %bb2 } +; GFX9-LABEL: test_loop_128 +; GFX9: .p2align 4 + ; GCN-LABEL: test_loop_128 ; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 ; GFX10-ASM-NEXT: .p2align 6 @@ -68,6 +76,9 @@ bb2: ; preds = %bb2, %bb br i1 %tmp3, label %bb1, label %bb2 } +; GFX9-LABEL: test_loop_192 +; GFX9: .p2align 4 + ; GCN-LABEL: test_loop_192 ; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 ; GFX10-NEXT: s_inst_prefetch 0x1 @@ -128,11 +139,15 @@ bb2: ; preds = %bb2, %bb br i1 %tmp3, label %bb1, label %bb2 } +; GFX9-LABEL: test_loop_256 +; GFX9: .p2align 4 + ; GCN-LABEL: test_loop_256 ; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 -; GFX10-DIS-NEXT: {{^$}} +; GFX10-ASM-NEXT: p2align 4 +; GFX10-DIS-NEXT: s_nop 0 ; GFX10-ASM-NEXT: [[L1:.LBB[0-9_]+]]: -; GFX10-DIS-NEXT: <[[L1:L[0-9]+]]>: +; GFX10-DIS: <[[L1:L[0-9]+]]>: ; GFX10: s_sleep 0 ; GFX10: s_cbranch_scc0 [[L1]] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir 
b/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir index 05cfe53224582..3262eccda5694 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir +++ b/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir @@ -26,7 +26,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: bb.5: + ; GFX10-NEXT: bb.5 (align 32): ; GFX10-NEXT: successors: %bb.1(0x04000000), %bb.5(0x7c000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: S_NOP 0 @@ -52,7 +52,7 @@ body: | ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: S_BRANCH %bb.2 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.1: + ; GFX11-NEXT: bb.1 (align 32): ; GFX11-NEXT: successors: %bb.7(0x04000000), %bb.2(0x7c000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc_lo @@ -62,7 +62,7 @@ body: | ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.5: + ; GFX11-NEXT: bb.5 (align 32): ; GFX11-NEXT: successors: %bb.1(0x04000000), %bb.5(0x7c000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: S_NOP 0 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll index 34a9624cb19eb..852453de5bb32 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll @@ -72,6 +72,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: s_inst_prefetch 0x2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_mov_b32 vcc_lo, exec_lo +; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_9: ; %for.body159 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_9 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 
553d7e09390fd..e48c0a86d1724 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -122,6 +122,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v0 +; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_nc_u32_e32 v1, s5, v44 ; CHECK-NEXT: s_add_i32 s5, s5, 1 @@ -161,6 +162,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s69, 0 ; CHECK-NEXT: s_mov_b32 s80, 0 ; CHECK-NEXT: s_branch .LBB0_8 +; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 ; CHECK-NEXT: s_add_i32 s80, s80, 4 @@ -372,6 +374,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ; %bb.26: ; CHECK-NEXT: s_mov_b32 s52, 0 ; CHECK-NEXT: s_branch .LBB0_28 +; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -889,6 +892,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB1_3: ; %.53 ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll b/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll index 4c2967a52fe93..6b8c23e33471e 100644 --- a/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: 
--version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s @_RSENC_gDcd_______________________________ = external protected addrspace(1) externally_initialized global [4096 x i8], align 16 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index c92c672dda2ad..4d7acfbafae63 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s %"struct.__llvm_libc::rpc::Buffer" = type { [8 x i64] } diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 4c0ab91b7d622..cbe54b5c2647d 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -mattr=-unaligned-access-mode %s -o - | FileCheck -check-prefix=ALIGNED %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-memcpy-loop-unroll=3 %s -o - | FileCheck -check-prefix=UNROLL3 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-disable-all-loop-alignment=true %s -o - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-disable-all-loop-alignment=true -mattr=-unaligned-access-mode %s -o - | FileCheck -check-prefix=ALIGNED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx1030 -amdgpu-disable-all-loop-alignment=true -amdgpu-memcpy-loop-unroll=3 %s -o - | FileCheck -check-prefix=UNROLL3 %s ; For checking that LowerMemIntrinsics lowers memcpy and memmove with large ; constant copy-sizes into loops with multiple load/store pairs. @@ -730,7 +730,6 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; UNROLL3: ; %bb.0: ; %entry ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 -; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB0_1: ; %load-store-loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 @@ -1484,7 +1483,6 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; UNROLL3: ; %bb.0: ; %entry ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 -; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB1_1: ; %load-store-loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 @@ -2091,7 +2089,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; UNROLL3: ; %bb.0: ; %entry ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 -; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB2_1: ; %load-store-loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 @@ -5353,8 +5350,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: v_mov_b32_e32 v3, v2 ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 -; UNROLL3-NEXT: s_inst_prefetch 0x1 -; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB4_1: ; %load-store-loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb @@ -5384,7 +5379,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; UNROLL3-NEXT: 
s_and_b32 vcc_lo, exec_lo, s6 ; UNROLL3-NEXT: s_cbranch_vccnz .LBB4_1 ; UNROLL3-NEXT: ; %bb.2: ; %memcpy-split -; UNROLL3-NEXT: s_inst_prefetch 0x2 ; UNROLL3-NEXT: s_clause 0x3 ; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:2016 ; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2020 @@ -6840,7 +6834,6 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; UNROLL3-NEXT: s_cbranch_execz .LBB5_4 ; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 -; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB5_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 @@ -6883,7 +6876,6 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2016 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016 -; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB5_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 @@ -8339,7 +8331,6 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; UNROLL3-NEXT: s_cbranch_execz .LBB6_4 ; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 -; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB6_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 @@ -8382,7 +8373,6 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:2016 -; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB6_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: 
v_add_co_u32 v12, vcc_lo, v2, s4 @@ -9562,7 +9552,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; UNROLL3-NEXT: s_cbranch_execz .LBB7_4 ; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader ; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 -; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB7_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 @@ -9607,7 +9596,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2032 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:2016 -; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB7_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 @@ -15917,8 +15905,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: s_cbranch_execz .LBB9_4 ; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader ; UNROLL3-NEXT: v_mov_b32_e32 v3, v2 -; UNROLL3-NEXT: s_inst_prefetch 0x1 -; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB9_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb @@ -15947,7 +15933,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0 ; UNROLL3-NEXT: s_cbranch_scc1 .LBB9_2 ; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual -; UNROLL3-NEXT: s_inst_prefetch 0x2 ; UNROLL3-NEXT: s_clause 0x3 ; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:2016 ; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2020 @@ -15986,8 +15971,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v2 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2016 -; 
UNROLL3-NEXT: s_inst_prefetch 0x1 -; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB9_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb @@ -16016,7 +15999,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] ; UNROLL3-NEXT: s_cbranch_scc0 .LBB9_6 ; UNROLL3-NEXT: .LBB9_7: ; %Flow9 -; UNROLL3-NEXT: s_inst_prefetch 0x2 ; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) ; UNROLL3-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll index dd5c247f6ef35..b0b0d7016ddf2 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-disable-all-loop-alignment=true %s -o - | FileCheck %s ; Check code generation for memmoves with statically unknown size and all ; combinations of the following address spaces: @@ -38,7 +38,6 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_mov_b32_e32 v10, v0 ; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_dwordx4 v[14:17], v[4:5] @@ -64,7 +63,6 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: 
flat_load_ubyte v4, v[2:3] @@ -100,7 +98,6 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v11, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v3, v11, s4 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v12, v[10:11] @@ -126,7 +123,6 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 @@ -186,7 +182,6 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_mov_b32_e32 v10, v0 ; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off @@ -212,7 +207,6 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off @@ -248,7 +242,6 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_12: ; %memmove_bwd_residual_loop ; 
CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[4:5], off @@ -274,7 +267,6 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 @@ -335,7 +327,6 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: v_mov_b32_e32 v11, v5 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB2_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[13:16], v4 @@ -360,7 +351,6 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v6, s5 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB2_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_u8 v3, v2 @@ -394,7 +384,6 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 ; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB2_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_u8 v11, v4 @@ -419,7 +408,6 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; CHECK-NEXT: v_add3_u32 v2, v3, v2, -16 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB2_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; CHECK-NEXT: ds_read_b128 v[7:10], v2 @@ -478,7 +466,6 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_mov_b32_e32 v10, v0 ; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB3_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off @@ -504,7 +491,6 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB3_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off @@ -540,7 +526,6 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v11, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v3, v11, s4 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB3_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off @@ -566,7 +551,6 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB3_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 @@ -627,7 +611,6 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: v_mov_b32_e32 v11, v5 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB4_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; CHECK-NEXT: s_clause 0x3 @@ -656,7 +639,6 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v6, s5 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB4_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen @@ -690,7 +672,6 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 ; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB4_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen @@ -715,7 +696,6 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; CHECK-NEXT: v_add3_u32 v2, v3, v2, -16 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB4_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3 @@ -778,7 +758,6 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_mov_b32_e32 v10, v0 ; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB5_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_dwordx4 v[14:17], v[4:5] @@ -804,7 +783,6 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB5_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: 
flat_load_ubyte v4, v[2:3] @@ -840,7 +818,6 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v11, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v3, v11, s4 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB5_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v12, v[10:11] @@ -866,7 +843,6 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB5_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 @@ -924,7 +900,6 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_mov_b32_e32 v10, v0 ; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off @@ -950,7 +925,6 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off @@ -986,7 +960,6 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v11, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v3, v11, s4 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_12: ; %memmove_bwd_residual_loop ; 
CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off @@ -1012,7 +985,6 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 @@ -1136,7 +1108,6 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_mov_b32_e32 v10, v0 ; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off @@ -1162,7 +1133,6 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off @@ -1198,7 +1168,6 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v11, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v3, v11, s4 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off @@ -1224,7 +1193,6 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: 
.p2align 6 ; CHECK-NEXT: .LBB8_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 @@ -1266,7 +1234,6 @@ define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v2 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB9_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3 @@ -1358,7 +1325,6 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_mov_b32_e32 v11, v7 ; CHECK-NEXT: v_mov_b32_e32 v4, v0 ; CHECK-NEXT: s_mov_b32 s9, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB10_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10] @@ -1383,7 +1349,6 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3 ; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v8, s5 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB10_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v2, v[0:1] @@ -1417,7 +1382,6 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 ; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB10_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v11, v[9:10] @@ -1442,7 +1406,6 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, -1, v2, vcc_lo ; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16 ; CHECK-NEXT: s_mov_b32 s5, 0 -; CHECK-NEXT: .p2align 6 ; 
CHECK-NEXT: .LBB10_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7 @@ -1744,7 +1707,6 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: v_mov_b32_e32 v7, v1 ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB14_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3 @@ -1833,7 +1795,6 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_mov_b32_e32 v11, v7 ; CHECK-NEXT: v_mov_b32_e32 v4, v0 ; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB15_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10] @@ -1861,7 +1822,6 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3 ; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v8, s5 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB15_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v2, v[0:1] @@ -1895,7 +1855,6 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 ; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB15_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v11, v[9:10] @@ -1920,7 +1879,6 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, -1, v2, vcc_lo ; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16 ; CHECK-NEXT: s_mov_b32 s5, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB15_15: ; 
%memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7 @@ -1962,7 +1920,6 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB16_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 @@ -2033,7 +1990,6 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: v_mov_b32_e32 v7, v1 ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB17_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[9:12], v7 @@ -2100,7 +2056,6 @@ define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB18_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 @@ -2182,7 +2137,6 @@ define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: v_mov_b32_e32 v3, v1 ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB19_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3 @@ -2269,7 +2223,6 @@ define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5 ; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4 ; CHECK-NEXT: v_sub_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB19_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; CHECK-NEXT: s_clause 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 6110b3101020a..51d279a5d4ac5 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s ; Check that we do not copy agprs to vgprs and back inside the loop. diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index f4e5c276b8b75..cc3d5bd32f812 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=GFX9 %s ; Make sure that AMDGPUCodeGenPrepare introduces mul24 intrinsics ; after SLSR, as the intrinsics would interfere. 
It's unclear if these diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll index 306703bd61806..5ad6aadff039c 100644 --- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll @@ -61,6 +61,7 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: ; implicit-def: $sgpr2 ; GFX12-NEXT: s_branch .LBB0_2 +; GFX12-NEXT: .p2align 4 ; GFX12-NEXT: .LBB0_1: ; %Flow ; GFX12-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index cf244f0b1f884..f81e51cff3647 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 %s -o - | FileCheck %s --check-prefixes=GFX942 -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 %s -o - | FileCheck %s --check-prefixes=GFX908 +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn--amdhsa -mcpu=gfx942 %s -o - | FileCheck %s --check-prefixes=GFX942 +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn--amdhsa -mcpu=gfx908 %s -o - | FileCheck %s --check-prefixes=GFX908 define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll index 88cc06d8b3832..54357c6d48be1 100644 --- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: opt 
-mtriple=amdgcn -mcpu=gfx900 -amdgpu-aa -amdgpu-aa-wrapper -amdgpu-annotate-uniform -S < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefix=GCN %s ; Check that barrier or fence in between of loads is not considered a clobber ; for the purpose of converting vector loads into scalar. diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index b1e05158b6212..70143b00f42a5 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck 
-check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s declare i64 @_Z13get_global_idj(i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index f0c8fed925673..dfeb684ef5aa5 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-SDAG-O0 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-SDAG-O0 %s ; FIXME: GlobalISel missing the power-of-2 cases in legalization. 
https://github.com/llvm/llvm-project/issues/80671 -; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9 %s -; xUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-O0 %s}} +; xUN: llc -amdgpu-disable-all-loop-alignment=true -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9 %s +; xUN: llc -amdgpu-disable-all-loop-alignment=true -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-O0 %s}} define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-LABEL: v_srem_i128_vv: diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 19f0e93c308d8..e8681d366dd0a 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOSDWA %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX89 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX9_10,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-sdwa-peephole < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX9_10,GFX10 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX9_10,GFX9 %s +; RUN: llc 
-amdgpu-disable-all-loop-alignment=true -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-sdwa-peephole < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDWA,GFX9_10,GFX10 %s define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; NOSDWA-LABEL: add_shr_i32: diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll index f497752994852..ff38e33b07a82 100644 --- a/llvm/test/CodeGen/AMDGPU/select-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -amdgpu-scalar-ir-passes=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -amdgpu-scalar-ir-passes=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s define float @select_undef_lhs(float %val, i1 %cond) { ; GCN-LABEL: select_undef_lhs: diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll index 192bd2073886a..8ea824488634b 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN %s define amdgpu_kernel void @copy_to_vreg_1(i32 %0) { ; GCN-LABEL: copy_to_vreg_1: diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll index 98c4868e213db..6a253599c3124 100644 --- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll +++ 
b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck %s -check-prefix=GCN +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck %s -check-prefix=GCN define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 inreg %v, i32 %lane, i32 %f, i32 %f2) #0 { ; GCN-LABEL: should_not_hoist_set_inactive: diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index b21c781f6223a..459fe75b8cdd2 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,SI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,GFX10-WAVE64 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX10-WAVE32 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,GFX10-WAVE64 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX10-WAVE32 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true 
-mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX11 %s define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 { ; GCN-LABEL: test_kill_depth_0_imm_pos: diff --git a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll index 42436a1b4c279..e98d03307d507 100644 --- a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll +++ b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s %pair = type { i32, i32 } diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll index 4a5dc8f300af3..3412c2ad82189 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=4 -o - %s | FileCheck %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=4 -o - %s | FileCheck %s ; Make sure we can rematerialize split 64-bit constants (which ; MachineLICM hoisted out of the loop) and avoid spilling inside the diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll index 0cf26be3ac24f..20a7f6e4e58ec 100644 --- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll +++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn-mesa-mesa3d 
-mcpu=gfx900 -stop-after=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s ; If the block containing the SI_RETURN_TO_EPILOG is not the last block, insert an empty block at the end and ; insert an unconditional jump there. define amdgpu_ps float @simple_test_return_to_epilog(float %a) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 469ea24634f62..2912cf98804ff 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc %s -o - -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=NOHSA-TRAP-GFX900 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true %s -o - -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=NOHSA-TRAP-GFX900 %s ; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=HSA-TRAP-GFX803 %s -; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA-TRAP-GFX900 %s -; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler | FileCheck --check-prefix=HSA-NOTRAP-GFX900 %s -; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 | FileCheck --check-prefix=HSA-TRAP-GFX1100 %s -; RUN: llc %s -o - -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 | FileCheck --check-prefix=HSA-TRAP-GFX1100-O0 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA-TRAP-GFX900 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler | FileCheck --check-prefix=HSA-NOTRAP-GFX900 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 
| FileCheck --check-prefix=HSA-TRAP-GFX1100 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true %s -o - -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 | FileCheck --check-prefix=HSA-TRAP-GFX1100-O0 %s declare void @llvm.trap() #0 declare void @llvm.debugtrap() #1 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index f6c357dc38b48..41c27e7e16a6e 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -greedy-regclass-priority-trumps-globalness=1 -o - %s | FileCheck -check-prefixes=GFX90A,GLOBALNESS1 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -greedy-regclass-priority-trumps-globalness=0 -o - %s | FileCheck -check-prefixes=GFX90A,GLOBALNESS0 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -greedy-regclass-priority-trumps-globalness=1 -o - %s | FileCheck -check-prefixes=GFX90A,GLOBALNESS1 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -greedy-regclass-priority-trumps-globalness=0 -o - %s | FileCheck -check-prefixes=GFX90A,GLOBALNESS0 %s declare void @wobble() diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll index f001bf0d5e498..6684e12ef14fe 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-select.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll @@ -14,6 +14,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX90A-NEXT: s_mov_b32 s4, 0 ; GFX90A-NEXT: s_mov_b32 s5, 0 ; GFX90A-NEXT: s_mov_b32 s6, 0 +; GFX90A-NEXT: .p2align 4 ; GFX90A-NEXT: .LBB0_1: ; %for.body ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -62,6 +63,7 @@ define amdgpu_kernel void 
@test_insert_extract(i32 %p, i32 %q) { ; GFX942-NEXT: s_mov_b32 s4, 0 ; GFX942-NEXT: s_mov_b32 s5, 0 ; GFX942-NEXT: s_mov_b32 s6, 0 +; GFX942-NEXT: .p2align 4 ; GFX942-NEXT: .LBB0_1: ; %for.body ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll index 490046ce5b856..57d075fa32018 100644 --- a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s define half @swap(half %a, half %b, i32 %i) { ; GFX11-TRUE16-LABEL: swap: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll index b46f5f5640b66..e9b9c24a1095d 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions 
have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-opt-vgpr-liverange=true < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-opt-vgpr-liverange=true < %s | FileCheck -check-prefix=SI %s ; a normal if-else define amdgpu_ps float @else1(i32 %z, float %v) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index d8264b5a091e1..4f0b3443f85f3 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefix=GFX942 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefix=GFX942 %s define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX942-LABEL: v3i8_liveout: diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 097154ed23ede..19c761e74cd6e 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s -; 
RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-early-ifcvt=1 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-early-ifcvt=1 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s +; RUN: llc -amdgpu-disable-all-loop-alignment=true -mtriple=amdgcn -mcpu=gfx1010 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_i32: diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll index 19c8e842a1390..f525d00014cd5 100644 --- a/llvm/test/CodeGen/AMDGPU/while-break.ll +++ b/llvm/test/CodeGen/AMDGPU/while-break.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=GCN +; RUN: llc -amdgpu-disable-all-loop-alignment=true 
-mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=GCN define amdgpu_ps float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; GCN-LABEL: while_break: diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index ad8dcd3888e9f..84c791341f1b9 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -763,6 +763,7 @@ define amdgpu_ps float @test_wwm6_loop() { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-W64-NEXT: .p2align 4 ; GFX9-W64-NEXT: .LBB17_1: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -791,6 +792,7 @@ define amdgpu_ps float @test_wwm6_loop() { ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: s_mov_b32 s0, 0 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 +; GFX10-W32-NEXT: .p2align 5 ; GFX10-W32-NEXT: .LBB17_1: ; %loop ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 @@ -1229,6 +1231,7 @@ define amdgpu_ps float @test_strict_wqm6_loop() { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-W64-NEXT: .p2align 4 ; GFX9-W64-NEXT: .LBB25_1: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec @@ -1260,6 +1263,7 @@ define amdgpu_ps float @test_strict_wqm6_loop() { ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: s_mov_b32 s0, 0 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 +; GFX10-W32-NEXT: .p2align 5 ; GFX10-W32-NEXT: .LBB25_1: ; %loop ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo @@ -1936,6 +1940,7 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { ; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000 ; GFX9-W64-NEXT: s_branch 
.LBB35_2 +; GFX9-W64-NEXT: .p2align 4 ; GFX9-W64-NEXT: .LBB35_1: ; %body ; GFX9-W64-NEXT: ; in Loop: Header=BB35_2 Depth=1 ; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf @@ -2660,6 +2665,7 @@ define amdgpu_ps float @test_strict_wwm6_loop() { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-W64-NEXT: .p2align 4 ; GFX9-W64-NEXT: .LBB47_1: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -2688,6 +2694,7 @@ define amdgpu_ps float @test_strict_wwm6_loop() { ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: s_mov_b32 s0, 0 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 +; GFX10-W32-NEXT: .p2align 5 ; GFX10-W32-NEXT: .LBB47_1: ; %loop ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1