diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index d214ab9306c2f..b4c8ebaaa1f36 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -153,6 +153,7 @@ namespace llvm { /// ProcessImpicitDefs pass - This pass removes IMPLICIT_DEFs. extern char &ProcessImplicitDefsID; + extern char &MaxsMachineFunctionID; /// RegisterCoalescer - This pass merges live ranges to eliminate copies. extern char &RegisterCoalescerID; diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index ea6afbfdc7861..e0331dcec48ad 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -246,6 +246,7 @@ void initializePreISelIntrinsicLoweringLegacyPassPass(PassRegistry &); void initializePrintFunctionPassWrapperPass(PassRegistry &); void initializePrintModulePassWrapperPass(PassRegistry &); void initializeProcessImplicitDefsPass(PassRegistry &); +void initializeMaxsMachineFunctionPass(PassRegistry &); void initializeProfileSummaryInfoWrapperPassPass(PassRegistry &); void initializePromoteLegacyPassPass(PassRegistry &); void initializeRABasicPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 3ba2d259fe78c..114b3d3d3dc99 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -306,6 +306,7 @@ DUMMY_MACHINE_FUNCTION_PASS("mirfs-discriminators", MIRAddFSDiscriminatorsPass) DUMMY_MACHINE_FUNCTION_PASS("postra-machine-sink", PostRAMachineSinkingPass) DUMMY_MACHINE_FUNCTION_PASS("print-machine-uniformity", MachineUniformityInfoPrinterPass) DUMMY_MACHINE_FUNCTION_PASS("processimpdefs", ProcessImplicitDefsPass) +// DUMMY_MACHINE_FUNCTION_PASS("processimpdefsmaxs", MaxsMachineFunctionPass) DUMMY_MACHINE_FUNCTION_PASS("prologepilog", PrologEpilogInserterPass) DUMMY_MACHINE_FUNCTION_PASS("prologepilog-code", PrologEpilogCodeInserterPass) DUMMY_MACHINE_FUNCTION_PASS("ra-basic", RABasicPass) diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index dd04b2e81a2a7..00f3d37dba481 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -111,6 +111,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializePostRASchedulerLegacyPass(Registry); initializePreISelIntrinsicLoweringLegacyPassPass(Registry); initializeProcessImplicitDefsPass(Registry); + initializeMaxsMachineFunctionPass(Registry); initializeRABasicPass(Registry); initializeRAGreedyLegacyPass(Registry); initializeRegAllocFastPass(Registry); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 5b2e0558d5664..6565afc808f9f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1511,6 +1511,8 @@ void GCNPassConfig::addFastRegAlloc() { TargetPassConfig::addFastRegAlloc(); } +extern FunctionPass *createMaxsMachineFunctionPass(); + void GCNPassConfig::addOptimizedRegAlloc() { if (EnableDCEInRA) insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID); @@ -1545,6 +1547,8 @@ void GCNPassConfig::addOptimizedRegAlloc() { if (TM->getOptLevel() > CodeGenOptLevel::Less) insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); + addPass(createMaxsMachineFunctionPass()); + TargetPassConfig::addOptimizedRegAlloc(); } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 09a3096602fc3..ca7ff8e60a60a 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -114,6 +114,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUUnifyDivergentExitNodes.cpp AMDGPUUnifyMetadata.cpp R600MachineCFGStructurizer.cpp + MaxsMachineFunction.cpp GCNCreateVOPD.cpp GCNDPPCombine.cpp GCNHazardRecognizer.cpp diff --git a/llvm/lib/Target/AMDGPU/MaxsMachineFunction.cpp b/llvm/lib/Target/AMDGPU/MaxsMachineFunction.cpp new file mode 100644 index 0000000000000..d2a1db17b4b7c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/MaxsMachineFunction.cpp @@ -0,0 +1,190 @@ +//===---------------------- MaxsMachineFunctionPass.cpp -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SIMachineFunctionInfo.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "maxsmachinefunction" + +namespace { + +cl::opt UnpackFOps("amdgpu-unpack-fops", cl::Hidden, + cl::desc("unpack f ops"), cl::init(false)); + +struct MaxsMachineFunction : MachineFunctionPass { + static char ID; + + MaxsMachineFunction() : MachineFunctionPass(ID) { + // initializeMaxsMachineFunctionPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &au) const override; + + bool runOnMachineFunction(MachineFunction &MF) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } +}; +} // end anonymous namespace + +char MaxsMachineFunction::ID = 0; +char &llvm::MaxsMachineFunctionID = MaxsMachineFunction::ID; + +static void initializeMaxsMachineFunctionPassOnce(PassRegistry &Registry) { + PassInfo *PI = new PassInfo( + "MaxsMachineFunction", "maxsmachinefunction", &MaxsMachineFunction::ID, + PassInfo::NormalCtor_t(callDefaultCtor), false, + false); + Registry.registerPass(*PI, true); +} + +static llvm::once_flag InitializeMaxsMachineFunctionPassFlag; + +void llvm::initializeMaxsMachineFunctionPass(PassRegistry &Registry) { + llvm::call_once(InitializeMaxsMachineFunctionPassFlag, + initializeMaxsMachineFunctionPassOnce, std::ref(Registry)); +} + +FunctionPass *createMaxsMachineFunctionPass() { + return new MaxsMachineFunction(); +} + +void MaxsMachineFunction::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +std::optional findNthUser(MachineInstr &MI, + MachineRegisterInfo *MRI, + const Register &CRReg, + unsigned N = 1) { + MachineBasicBlock::iterator I = MI; + unsigned Idx = 0; + for (MachineBasicBlock::iterator EL = MI.getParent()->end(); I != EL; ++I) { + for (MachineRegisterInfo::use_instr_iterator + J = MRI->use_instr_begin(CRReg), + JE = MRI->use_instr_end(); + J != JE; ++J) + if (&*J == &*I) { + Idx++; + } + if (Idx == N) { + return &*I; + } + } + return {}; +} + +bool MaxsMachineFunction::runOnMachineFunction(MachineFunction &MF) { + + LLVM_DEBUG(dbgs() << "********** MaxsMachineFunction **********\n" + << "********** Function: " << MF.getName() << '\n'); + + bool Changed = false; + if (!UnpackFOps) + return Changed; + + MachineRegisterInfo *MRI = &MF.getRegInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + SmallVector toRemove; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() == AMDGPU::V_PK_ADD_F32) { + MachineOperand &OldDest = MI.getOperand(0); + MachineOperand &Lhs = MI.getOperand(2); + MachineOperand &Rhs = MI.getOperand(4); + + auto lhsLow = MachineOperand::CreateReg( + Lhs.getReg(), Lhs.isDef(), Lhs.isImplicit(), Lhs.isKill(), + Lhs.isDead(), Lhs.isUndef(), Lhs.isEarlyClobber(), AMDGPU::sub0, + Lhs.isDebug(), Lhs.isInternalRead()); + + auto rhsLow = MachineOperand::CreateReg( + Rhs.getReg(), Rhs.isDef(), Rhs.isImplicit(), Rhs.isKill(), + Rhs.isDead(), Rhs.isUndef(), Rhs.isEarlyClobber(), AMDGPU::sub0, + Rhs.isDebug(), Rhs.isInternalRead()); + + Register DstReg1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + MachineInstrBuilder MIB = + BuildMI(MBB, MI, {}, TII->get(AMDGPU::V_ADD_F32_e32), DstReg1) + .add({lhsLow, rhsLow}); + if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept)) + (void)MIB.setMIFlag(MachineInstr::MIFlag::NoFPExcept); + + auto lhsHigh = MachineOperand::CreateReg( + Lhs.getReg(), Lhs.isDef(), Lhs.isImplicit(), Lhs.isKill(), + Lhs.isDead(), Lhs.isUndef(), Lhs.isEarlyClobber(), AMDGPU::sub1, + Lhs.isDebug(), Lhs.isInternalRead()); + + auto rhsHigh = MachineOperand::CreateReg( + Rhs.getReg(), Rhs.isDef(), Rhs.isImplicit(), Rhs.isKill(), + Rhs.isDead(), Rhs.isUndef(), Rhs.isEarlyClobber(), AMDGPU::sub1, + Rhs.isDebug(), Rhs.isInternalRead()); + + Register DstReg2 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + MIB = BuildMI(MBB, MI, {}, TII->get(AMDGPU::V_ADD_F32_e32), DstReg2) + .add({lhsHigh, rhsHigh}); + if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept)) + (void)MIB.setMIFlag(MachineInstr::MIFlag::NoFPExcept); + + Register nextOperand; + std::optional I; + if (I = findNthUser(MI, MRI, OldDest.getReg()); *I) { + nextOperand = (*I)->getOperand(0).getReg(); + (*I)->getOperand(1).ChangeToRegister(DstReg2, /*isDef*/ false); + } + + Register DstReg3 = + MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass); + + auto reqSeq = BuildMI(MBB, *I, {}, TII->get(AMDGPU::REG_SEQUENCE)) + .addDef(DstReg3) + .addUse(DstReg1) + .addImm(AMDGPU::sub0) + .addUse(DstReg2) + .addImm(AMDGPU::sub1) + .getInstr(); + + if (auto I = findNthUser(MI, MRI, nextOperand, 2)) { + reqSeq->getOperand(0).dump(); + (*I)->getOperand(1).ChangeToRegister(reqSeq->getOperand(0).getReg(), + /*isDef*/ false); + } + + toRemove.push_back(&MI); + Changed = true; + } else if (MI.getOpcode() == AMDGPU::V_PK_MUL_F32) { + MI.dump(); + } + } + } + for (auto remove : toRemove) + remove->eraseFromParent(); + if (Changed) + MF.dump(); + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/test_v_pk.mir b/llvm/lib/Target/AMDGPU/test_v_pk.mir new file mode 100644 index 0000000000000..6df9f39034855 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/test_v_pk.mir @@ -0,0 +1,278 @@ +--- | + ; ModuleID = '/home/mlevental/dev_projects/llvm-project/llvm/lib/Target/AMDGPU/test_v_pk.ll' + source_filename = "/home/mlevental/dev_projects/llvm-project/llvm/lib/Target/AMDGPU/test_v_pk.ll" + target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" + target triple = "amdgcn" + + define amdgpu_kernel void @add_kernel(ptr addrspace(1) readonly captures(none) %0, ptr addrspace(1) readonly captures(none) %1, ptr addrspace(1) writeonly captures(none) %2, i32 %3) local_unnamed_addr #0 { + %add_kernel.kernarg.segment = call nonnull align 16 dereferenceable(284) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() + %.kernarg.offset7 = getelementptr inbounds i8, ptr addrspace(4) %add_kernel.kernarg.segment, i64 60, !amdgpu.uniform !0 + %.load8 = load i32, ptr addrspace(4) %.kernarg.offset7, align 4, !invariant.load !0 + %5 = tail call i32 @llvm.amdgcn.workgroup.id.x() + %6 = shl i32 %5, 10 + %7 = tail call i32 @llvm.amdgcn.workitem.id.x() + %8 = shl i32 %7, 2 + %9 = and i32 %8, 1020 + %10 = or disjoint i32 %9, %6 + %11 = icmp slt i32 %10, %.load8 + %12 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %11) + %13 = extractvalue { i1, i64 } %12, 0 + %14 = extractvalue { i1, i64 } %12, 1 + br i1 %13, label %.critedge, label %.critedge2 + + .critedge: ; preds = %4 + %.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %add_kernel.kernarg.segment, i64 36, !amdgpu.uniform !0 + %15 = load <3 x i64>, ptr addrspace(4) %.kernarg.offset, align 4, !invariant.load !0 + %.load9 = extractelement <3 x i64> %15, i32 0 + %16 = inttoptr i64 %.load9 to ptr addrspace(1) + %.load410 = extractelement <3 x i64> %15, i32 1 + %17 = inttoptr i64 %.load410 to ptr addrspace(1) + %.load611 = extractelement <3 x i64> %15, i32 2 + %18 = inttoptr i64 %.load611 to ptr addrspace(1) + %19 = sext i32 %10 to i64 + %20 = getelementptr float, ptr addrspace(1) %16, i64 %19 + %21 = load <2 x float>, ptr addrspace(1) %20, align 16 + %22 = extractelement <2 x float> %21, i32 0 + %23 = extractelement <2 x float> %21, i32 1 + %v_100 = insertelement <2 x float> undef, float %22, i32 0 + %v_102 = insertelement <2 x float> %v_100, float %23, i32 1 + %24 = getelementptr inbounds i8, ptr addrspace(1) %20, i64 8 + %25 = getelementptr float, ptr addrspace(1) %17, i64 %19 + %26 = getelementptr float, ptr addrspace(1) %18, i64 %19 + %27 = getelementptr inbounds i8, ptr addrspace(1) %25, i64 12 + %28 = load <2 x float>, ptr addrspace(1) %25, align 16 + %29 = extractelement <2 x float> %28, i32 0 + %30 = extractelement <2 x float> %28, i32 1 + %v_400 = insertelement <2 x float> undef, float %30, i32 0 + %v_402 = insertelement <2 x float> %v_400, float %29, i32 1 + %v_500 = fadd <2 x float> %v_102, %v_402 + %v_30 = extractelement <2 x float> %v_500, i32 1 + %v_28 = extractelement <2 x float> %v_500, i32 0 + store float %v_28, ptr addrspace(1) %26, align 4 + %p_301 = getelementptr i8, ptr addrspace(1) %26, i64 8 + %31 = insertelement <2 x float> poison, float %v_30, i32 0 + %32 = insertelement <2 x float> %31, float %v_28, i32 1 + store <2 x float> %32, ptr addrspace(1) %p_301, align 4 + br label %.critedge2, !amdgpu.uniform !0 + + .critedge2: ; preds = %.critedge, %4 + call void @llvm.amdgcn.end.cf.i64(i64 %14) + ret void + } + + ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) + declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1 + + ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) + declare noundef i32 @llvm.amdgcn.workitem.id.x() #1 + + ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) + declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2 + + ; Function Attrs: nocallback nofree nounwind willreturn + declare { i1, i64 } @llvm.amdgcn.if.i64(i1) #3 + + ; Function Attrs: nocallback nofree nounwind willreturn + declare void @llvm.amdgcn.end.cf.i64(i64) #3 + + attributes #0 = { "target-cpu"="gfx942" } + attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx942" } + attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + attributes #3 = { nocallback nofree nounwind willreturn } + + !0 = !{} +... +--- +name: add_kernel +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHContTarget: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 2, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 3, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] } + - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] } + - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] } + - { id: 7, class: sgpr_64, preferred-register: '', flags: [ ] } + - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] } + - { id: 11, class: sreg_32_xm0_xexec, preferred-register: '', flags: [ ] } + - { id: 12, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 13, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 14, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 15, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 17, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 18, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 19, class: sgpr_128, preferred-register: '', flags: [ ] } + - { id: 20, class: sreg_64_xexec, preferred-register: '', flags: [ ] } + - { id: 21, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 22, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 23, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 24, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 25, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 26, class: sreg_64, preferred-register: '', flags: [ ] } + - { id: 27, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 28, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 29, class: vreg_64_align2, preferred-register: '', flags: [ ] } + - { id: 30, class: sreg_32, preferred-register: '', flags: [ ] } + - { id: 31, class: vreg_64_align2, preferred-register: '', flags: [ ] } + - { id: 32, class: vreg_64_align2, preferred-register: '', flags: [ ] } + - { id: 33, class: vreg_64_align2, preferred-register: '', flags: [ ] } + - { id: 34, class: vreg_64_align2, preferred-register: '', flags: [ ] } + - { id: 35, class: vreg_64_align2, preferred-register: '', flags: [ ] } + - { id: 36, class: vreg_64_align2, preferred-register: '', flags: [ ] } + - { id: 37, class: vreg_64_align2, preferred-register: '', flags: [ ] } + - { id: 38, class: vgpr_32, preferred-register: '', flags: [ ] } + - { id: 39, class: vreg_64_align2, preferred-register: '', flags: [ ] } +liveins: + - { reg: '$vgpr0', virtual-reg: '%3' } + - { reg: '$sgpr4_sgpr5', virtual-reg: '%6' } + - { reg: '$sgpr8', virtual-reg: '%8' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + explicitKernArgSize: 28 + maxKernArgAlign: 8 + ldsSize: 0 + gdsSize: 0 + dynLDSAlign: 1 + isEntryFunction: true + isChainFunction: false + noSignedZerosFPMath: false + memoryBound: false + waveLimiter: false + hasSpilledSGPRs: false + hasSpilledVGPRs: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + bytesInStackArgArea: 0 + returnsVoid: true + argumentInfo: + dispatchPtr: { reg: '$sgpr0_sgpr1' } + queuePtr: { reg: '$sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + dispatchID: { reg: '$sgpr6_sgpr7' } + workGroupIDX: { reg: '$sgpr8' } + workGroupIDY: { reg: '$sgpr9' } + workGroupIDZ: { reg: '$sgpr10' } + workItemIDX: { reg: '$vgpr0', mask: 1023 } + workItemIDY: { reg: '$vgpr0', mask: 1047552 } + workItemIDZ: { reg: '$vgpr0', mask: 1072693248 } + psInputAddr: 0 + psInputEnable: 0 + maxMemoryClusterDWords: 8 + mode: + ieee: true + dx10-clamp: true + fp32-input-denormals: true + fp32-output-denormals: true + fp64-fp16-input-denormals: true + fp64-fp16-output-denormals: true + highBitsOf32BitAddress: 0 + occupancy: 8 + vgprForAGPRCopy: '' + sgprForEXECCopy: '$sgpr100_sgpr101' + longBranchReservedReg: '' + hasInitWholeWave: false + scratchReservedForDynamicVGPRs: 0 +body: | + bb.0 (%ir-block.4): + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8 + + %8:sgpr_32 = COPY $sgpr8 + %6:sgpr_64(p4) = COPY $sgpr4_sgpr5 + %3:vgpr_32(s32) = COPY $vgpr0 + %0:sreg_64 = COPY %6(p4) + %11:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %6(p4), 60, 0 :: (dereferenceable invariant load (s32) from %ir..kernarg.offset7, addrspace 4) + %12:sreg_32 = S_MOV_B32 2 + %13:vgpr_32 = V_LSHLREV_B32_e64 killed %12, %3(s32), implicit $exec + %14:sreg_32 = S_MOV_B32 1020 + %15:vgpr_32 = V_AND_B32_e64 killed %13, killed %14, implicit $exec + %16:sreg_32 = S_MOV_B32 10 + %17:vgpr_32 = COPY killed %16 + %1:vgpr_32 = V_LSHL_OR_B32_e64 %8, %17, killed %15, implicit $exec + %18:sreg_64 = V_CMP_LT_I32_e64 %1, killed %11, implicit $exec + %2:sreg_64 = SI_IF killed %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1..critedge: + successors: %bb.2(0x80000000) + + early-clobber %19:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0, 36, 0 :: (dereferenceable invariant load (s128) from %ir..kernarg.offset, align 4, addrspace 4) + early-clobber %20:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec %0, 52, 0 :: (dereferenceable invariant load (s64) from %ir..kernarg.offset + 16, align 4, addrspace 4) + %21:sreg_32 = COPY %19.sub3 + %22:sreg_32 = COPY %19.sub2 + %23:sreg_32 = COPY %19.sub1 + %24:sreg_32 = COPY %19.sub0 + %25:sreg_64 = REG_SEQUENCE killed %24, %subreg.sub0, killed %23, %subreg.sub1 + %26:sreg_64 = REG_SEQUENCE killed %22, %subreg.sub0, killed %21, %subreg.sub1 + %27:vgpr_32 = V_ASHRREV_I32_e64 31, %1, implicit $exec + %28:vgpr_32 = COPY %27 + %29:vreg_64_align2 = REG_SEQUENCE %1, %subreg.sub0, killed %28, %subreg.sub1 + %30:sreg_32 = S_MOV_B32 2 + %31:vreg_64_align2 = V_LSHLREV_B64_e64 killed %30, killed %29, implicit $exec + %32:vreg_64_align2 = V_LSHL_ADD_U64_e64 killed %25, 0, %31, implicit $exec + %33:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 killed %32, 0, 0, implicit $exec :: (load (s64) from %ir.20, align 16, addrspace 1) + %34:vreg_64_align2 = V_LSHL_ADD_U64_e64 killed %26, 0, %31, implicit $exec + %35:vreg_64_align2 = V_LSHL_ADD_U64_e64 killed %20, 0, %31, implicit $exec + %36:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 killed %34, 0, 0, implicit $exec :: (load (s64) from %ir.25, align 16, addrspace 1) + %37:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, killed %33, 4, killed %36, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %38:vgpr_32 = COPY %37.sub0 + GLOBAL_STORE_DWORD %35, killed %38, 0, 0, implicit $exec :: (store (s32) into %ir.26, addrspace 1) + %39:vreg_64_align2 = V_PK_MOV_B32 12, %37, 8, %37, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX2 %35, killed %39, 8, 0, implicit $exec :: (store (s64) into %ir.p_301, align 4, addrspace 1) + + bb.2..critedge2: + SI_END_CF %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_ENDPGM 0 +... diff --git a/llvm/lib/Target/AMDGPU/test_v_pk_fadd.ll b/llvm/lib/Target/AMDGPU/test_v_pk_fadd.ll new file mode 100644 index 0000000000000..df8ee6a8ed02c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/test_v_pk_fadd.ll @@ -0,0 +1,92 @@ +; test_v_pk.ll -mtriple=amdgcn -mcpu=gfx942 -o - +; test_v_pk.ll -mattr=-packed-fp32-ops -mtriple=amdgcn -mcpu=gfx942 -o - +; test_v_pk.ll -mtriple=amdgcn -mcpu=gfx942 -stop-after=finalize-isel -o - +; test_v_pk.mir -mtriple=amdgcn -mcpu=gfx942 -x mir -filetype=null -o - + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define amdgpu_kernel void @add_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture readonly %1, ptr addrspace(1) nocapture writeonly %2, i32 %3) local_unnamed_addr #0 { + %5 = tail call i32 @llvm.amdgcn.workgroup.id.x() + %6 = shl i32 %5, 10 + %7 = tail call i32 @llvm.amdgcn.workitem.id.x() + %8 = shl i32 %7, 2 + %9 = and i32 %8, 1020 + %10 = or disjoint i32 %9, %6 + %11 = icmp slt i32 %10, %3 + br i1 %11, label %.critedge, label %.critedge2 + +.critedge: ; preds = %4 + %12 = or disjoint i32 %10, 3 + %13 = or disjoint i32 %10, 2 + %14 = or disjoint i32 %10, 1 + %15 = sext i32 %10 to i64 + %16 = getelementptr float, ptr addrspace(1) %0, i64 %15 + %17 = addrspacecast ptr addrspace(1) %16 to ptr + %18 = load float, ptr %17, align 16 + %19 = getelementptr inbounds i8, ptr %17, i64 4 + %20 = load float, ptr %19, align 4 + + %v_100 = insertelement <2 x float> undef, float %18, i32 0 + %v_102 = insertelement <2 x float> %v_100, float %20, i32 1 + + %21 = getelementptr inbounds i8, ptr %17, i64 8 + %22 = load float, ptr %21, align 8 + %23 = getelementptr inbounds i8, ptr %17, i64 12 + %24 = load float, ptr %23, align 4 + + %v_200 = insertelement <2 x float> undef, float %22, i32 0 + %v_202 = insertelement <2 x float> %v_200, float %24, i32 1 + + %25 = getelementptr float, ptr addrspace(1) %1, i64 %15 + %26 = addrspacecast ptr addrspace(1) %25 to ptr + %27 = sext i32 %12 to i64 + %28 = getelementptr float, ptr addrspace(1) %2, i64 %27 + %29 = sext i32 %13 to i64 + %30 = getelementptr float, ptr addrspace(1) %2, i64 %29 + %31 = sext i32 %14 to i64 + %32 = getelementptr float, ptr addrspace(1) %2, i64 %31 + %33 = getelementptr inbounds i8, ptr %26, i64 12 + %34 = load float, ptr %33, align 4 + + %36 = getelementptr inbounds i8, ptr %26, i64 8 + %37 = load float, ptr %36, align 8 + + %v_300 = insertelement <2 x float> undef, float %34, i32 0 + %v_302 = insertelement <2 x float> %v_300, float %37, i32 1 + + %39 = getelementptr inbounds i8, ptr %26, i64 4 + %40 = load float, ptr %39, align 4 + %42 = load float, ptr %26, align 16 + + %v_400 = insertelement <2 x float> undef, float %40, i32 0 + %v_402 = insertelement <2 x float> %v_400, float %42, i32 1 + + %v_500 = fadd <2 x float> %v_102, %v_402 + ; %v_501 = fadd <2 x float> %v_202, %v_302 + ; tail call void @llvm.amdgcn.iglp.opt(i32 4) + + ; %v_45 = extractelement <2 x float> %v_501, i32 1 + ; %v_32 = extractelement <2 x float> %v_501, i32 0 + %v_30 = extractelement <2 x float> %v_500, i32 1 + %v_28 = extractelement <2 x float> %v_500, i32 0 + + %i_44 = sext i32 %10 to i64 + %p_45 = getelementptr float, ptr addrspace(1) %2, i64 %i_44 + store float %v_28, ptr addrspace(1) %p_45, align 4 + + ; %i_31 = sext i32 %14 to i64 + ; %p_32 = getelementptr float, ptr addrspace(1) %2, i64 %i_31 + ; store float %v_32, ptr addrspace(1) %p_32, align 4 + + %i_29 = sext i32 %13 to i64 + %p_30 = getelementptr float, ptr addrspace(1) %2, i64 %i_29 + store float %v_30, ptr addrspace(1) %p_30, align 4 + + %i_27 = sext i32 %12 to i64 + %p_28 = getelementptr float, ptr addrspace(1) %2, i64 %i_27 + store float %v_28, ptr addrspace(1) %p_28, align 4 + + br label %.critedge2 + +.critedge2: ; preds = %4, %.critedge + ret void +} diff --git a/llvm/lib/Target/AMDGPU/test_v_pk_fmul.ll b/llvm/lib/Target/AMDGPU/test_v_pk_fmul.ll new file mode 100644 index 0000000000000..b40394606471f --- /dev/null +++ b/llvm/lib/Target/AMDGPU/test_v_pk_fmul.ll @@ -0,0 +1,3222 @@ +; /home/mlevental/dev_projects/llvm-project/llvm/lib/Target/AMDGPU/test_v_pk_fmul.ll -mtriple=amdgcn -mcpu=gfx950 -o - +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare !dbg !6 float @llvm.amdgcn.exp2.f32(float) #0 + +; Function Attrs: nofree norecurse nounwind +define amdgpu_kernel void @attn_fwd(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg readonly captures(none) %1, ptr addrspace(1) inreg readonly captures(none) %2, ptr addrspace(1) inreg writeonly captures(none) %3, ptr addrspace(1) inreg writeonly captures(none) %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %11, i32 inreg %12, i32 inreg %13, i32 inreg %14, i32 inreg %15, i32 inreg %16, i32 inreg %17, i32 inreg %18, i32 inreg %19, i32 inreg %20, i32 inreg %21, i32 inreg %22, float inreg %23, i32 inreg %24, ptr addrspace(1) inreg readnone captures(none) %25, i32 inreg %26, ptr addrspace(1) inreg readnone captures(none) %27) local_unnamed_addr #1 !dbg !9 { + %29 = tail call i32 @llvm.amdgcn.workgroup.id.x(), !dbg !10 + %30 = tail call i32 @llvm.amdgcn.workgroup.id.y(), !dbg !11 + %31 = tail call i32 @llvm.amdgcn.workgroup.id.z(), !dbg !12 + %32 = shl i32 %29, 8, !dbg !13 + %33 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !14 + %34 = and i32 %33, 63, !dbg !14 + %35 = and i32 %33, 16, !dbg !14 + %36 = and i32 %33, 31, !dbg !14 + %37 = lshr i32 %33, 1, !dbg !14 + %38 = and i32 %37, 64, !dbg !14 + %39 = and i32 %33, 256, !dbg !14 + %40 = icmp eq i32 %39, 0, !dbg !14 + %41 = lshr i32 %33, 1, !dbg !14 + %42 = and i32 %41, 160, !dbg !14 + %43 = or disjoint i32 %42, %36, !dbg !14 + %44 = or disjoint i32 %43, %38, !dbg !14 + %45 = lshr i32 %33, 4, !dbg !14 + %46 = and i32 %45, 31, !dbg !14 + %47 = or disjoint i32 %46, 32, !dbg !14 + %48 = or disjoint i32 %46, 64, !dbg !14 + %49 = or disjoint i32 %46, 96, !dbg !14 + %50 = or disjoint i32 %46, 128, !dbg !14 + %51 = or disjoint i32 %46, 160, !dbg !14 + %52 = or disjoint i32 %46, 192, !dbg !14 + %53 = or disjoint i32 %46, 224, !dbg !14 + %54 = and i32 %33, 255, !dbg !14 + %55 = or disjoint i32 %44, %32, !dbg !15 + %56 = or disjoint i32 %46, %32, !dbg !15 + %57 = or disjoint i32 %47, %32, !dbg !15 + %58 = or disjoint i32 %48, %32, !dbg !15 + %59 = or disjoint i32 %49, %32, !dbg !15 + %60 = or disjoint i32 %50, %32, !dbg !15 + %61 = or disjoint i32 %51, %32, !dbg !15 + %62 = or disjoint i32 %52, %32, !dbg !15 + %63 = or disjoint i32 %53, %32, !dbg !15 + %64 = mul i32 %5, %31, !dbg !16 + %65 = sext i32 %64 to i64, !dbg !17 + %66 = getelementptr half, ptr addrspace(1) %0, i64 %65, !dbg !17 + %67 = mul i32 %6, %30, !dbg !18 + %68 = sext i32 %67 to i64, !dbg !19 + %69 = getelementptr half, ptr addrspace(1) %66, i64 %68, !dbg !19 + %70 = mul i32 %7, %32, !dbg !20 + %71 = mul i32 %7, %46, !dbg !20 + %72 = mul i32 %7, %47, !dbg !20 + %73 = mul i32 %7, %48, !dbg !20 + %74 = mul i32 %7, %49, !dbg !20 + %75 = mul i32 %7, %50, !dbg !20 + %76 = mul i32 %7, %51, !dbg !20 + %77 = mul i32 %7, %52, !dbg !20 + %78 = mul i32 %7, %53, !dbg !20 + %79 = sext i32 %70 to i64, !dbg !20 + %80 = getelementptr half, ptr addrspace(1) %69, i64 %79, !dbg !20 + %81 = lshr i32 %33, 3, !dbg !21 + %82 = and i32 %81, 4, !dbg !21 + %83 = shl i32 %33, 3, !dbg !21 + %84 = and i32 %83, 32, !dbg !21 + %85 = and i32 %83, 64, !dbg !21 + %86 = and i32 %83, 120, !dbg !21 + %87 = add i32 %71, %86, !dbg !22 + %88 = add i32 %72, %86, !dbg !22 + %89 = add i32 %73, %86, !dbg !22 + %90 = add i32 %74, %86, !dbg !22 + %91 = add i32 %75, %86, !dbg !22 + %92 = add i32 %76, %86, !dbg !22 + %93 = add i32 %77, %86, !dbg !22 + %94 = add i32 %78, %86, !dbg !22 + %95 = mul i32 %8, %31, !dbg !23 + %96 = sext i32 %95 to i64, !dbg !24 + %97 = getelementptr half, ptr addrspace(1) %1, i64 %96, !dbg !24 + %98 = mul i32 %9, %30, !dbg !25 + %99 = sext i32 %98 to i64, !dbg !26 + %100 = getelementptr half, ptr addrspace(1) %97, i64 %99, !dbg !26 + %101 = mul i32 %10, %46, !dbg !27 + %102 = mul i32 %10, %47, !dbg !27 + %103 = add i32 %101, %86, !dbg !27 + %104 = add i32 %102, %86, !dbg !27 + %105 = mul i32 %11, %31, !dbg !28 + %106 = sext i32 %105 to i64, !dbg !29 + %107 = getelementptr half, ptr addrspace(1) %2, i64 %106, !dbg !29 + %108 = mul i32 %12, %30, !dbg !30 + %109 = sext i32 %108 to i64, !dbg !31 + %110 = getelementptr half, ptr addrspace(1) %107, i64 %109, !dbg !31 + %111 = mul i32 %13, %46, !dbg !32 + %112 = mul i32 %13, %47, !dbg !32 + %113 = add i32 %111, %86, !dbg !33 + %114 = add i32 %112, %86, !dbg !33 + %115 = icmp slt i32 %55, 16384, !dbg !34 + %116 = icmp slt i32 %56, 16384, !dbg !34 + %117 = icmp slt i32 %57, 16384, !dbg !34 + %118 = icmp slt i32 %58, 16384, !dbg !34 + %119 = icmp slt i32 %59, 16384, !dbg !34 + %120 = icmp slt i32 %60, 16384, !dbg !34 + %121 = icmp slt i32 %61, 16384, !dbg !34 + %122 = icmp slt i32 %62, 16384, !dbg !34 + %123 = icmp slt i32 %63, 16384, !dbg !34 + %124 = shl i32 %10, 6, !dbg !35 + %125 = shl i32 %13, 6, !dbg !38 + %126 = mul i32 %31, 786432, !dbg !39 + %127 = sext i32 %126 to i64, !dbg !40 + %128 = getelementptr float, ptr addrspace(1) %3, i64 %127, !dbg !40 + %129 = shl i32 %30, 14, !dbg !41 + %130 = sext i32 %129 to i64, !dbg !42 + %131 = getelementptr float, ptr addrspace(1) %128, i64 %130, !dbg !42 + %132 = sext i32 %32 to i64, !dbg !43 + %133 = getelementptr float, ptr addrspace(1) %131, i64 %132, !dbg !43 + %134 = add i32 %32, -16128, !dbg !44 + %135 = icmp slt i32 %134, 1, !dbg !45 + %136 = trunc i32 %7 to i16, !dbg !46 + %137 = and i16 %136, 16383, !dbg !46 + %138 = or disjoint i16 %137, 16384, !dbg !46 + %139 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %80, i16 %138, i32 2147483646, i32 159744), !dbg !46 + %140 = shl i32 %87, 1, !dbg !46 + %141 = select i1 %116, i32 %140, i32 -2147483648, !dbg !46 + %142 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %141, i32 0, i32 0), !dbg !46 + %143 = shl i32 %88, 1, !dbg !46 + %144 = select i1 %117, i32 %143, i32 -2147483648, !dbg !46 + %145 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %144, i32 0, i32 0), !dbg !46 + %146 = shl i32 %89, 1, !dbg !46 + %147 = select i1 %118, i32 %146, i32 -2147483648, !dbg !46 + %148 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %147, i32 0, i32 0), !dbg !46 + %149 = shl i32 %90, 1, !dbg !46 + %150 = select i1 %119, i32 %149, i32 -2147483648, !dbg !46 + %151 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %150, i32 0, i32 0), !dbg !46 + %152 = shl i32 %91, 1, !dbg !46 + %153 = select i1 %120, i32 %152, i32 -2147483648, !dbg !46 + %154 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %153, i32 0, i32 0), !dbg !46 + %155 = shl i32 %92, 1, !dbg !46 + %156 = select i1 %121, i32 %155, i32 -2147483648, !dbg !46 + %157 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %156, i32 0, i32 0), !dbg !46 + %158 = shl i32 %93, 1, !dbg !46 + %159 = select i1 %122, i32 %158, i32 -2147483648, !dbg !46 + %160 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %159, i32 0, i32 0), !dbg !46 + %161 = shl i32 %94, 1, !dbg !46 + %162 = select i1 %123, i32 %161, i32 -2147483648, !dbg !46 + %163 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %162, i32 0, i32 0), !dbg !46 + fence syncscope("workgroup") release, !dbg !46 + tail call void @llvm.amdgcn.s.barrier(), !dbg !46 + fence syncscope("workgroup") acquire, !dbg !46 + %164 = and i32 %41, 56, !dbg !46 + %165 = or disjoint i32 %38, %164, !dbg !46 + %166 = xor i32 %165, %86, !dbg !46 + %167 = shl nuw nsw i32 %46, 7, !dbg !46 + %168 = or disjoint i32 %166, %167, !dbg !46 + %.idx = shl nuw nsw i32 %168, 1, !dbg !46 + %169 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !46 + store <4 x i32> %142, ptr addrspace(3) %169, align 16, !dbg !46 + %170 = or disjoint i32 %167, 4096, !dbg !46 + %171 = or disjoint i32 %170, %166, !dbg !46 + %.idx3 = shl nuw nsw i32 %171, 1, !dbg !46 + %172 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %.idx3, !dbg !46 + store <4 x i32> %145, ptr addrspace(3) %172, align 16, !dbg !46 + %173 = or disjoint i32 %168, 8192, !dbg !46 + %174 = getelementptr inbounds nuw half, ptr addrspace(3) @global_smem, i32 %173, !dbg !46 + store <4 x i32> %148, ptr addrspace(3) %174, align 16, !dbg !46 + %175 = or disjoint i32 %168, 12288, !dbg !46 + %176 = getelementptr inbounds nuw half, ptr addrspace(3) @global_smem, i32 %175, !dbg !46 + store <4 x i32> %151, ptr addrspace(3) %176, align 16, !dbg !46 + %177 = or disjoint i32 %168, 16384, !dbg !46 + %178 = getelementptr inbounds nuw half, ptr addrspace(3) @global_smem, i32 %177, !dbg !46 + store <4 x i32> %154, ptr addrspace(3) %178, align 16, !dbg !46 + %179 = or disjoint i32 %168, 20480, !dbg !46 + %180 = getelementptr inbounds nuw half, ptr addrspace(3) @global_smem, i32 %179, !dbg !46 + store <4 x i32> %157, ptr addrspace(3) %180, align 16, !dbg !46 + %181 = or disjoint i32 %168, 24576, !dbg !46 + %182 = getelementptr inbounds nuw half, ptr addrspace(3) @global_smem, i32 %181, !dbg !46 + store <4 x i32> %160, ptr addrspace(3) %182, align 16, !dbg !46 + %183 = or disjoint i32 %168, 28672, !dbg !46 + %184 = getelementptr inbounds nuw half, ptr addrspace(3) @global_smem, i32 %183, !dbg !46 + store <4 x i32> %163, ptr addrspace(3) %184, align 16, !dbg !46 + fence syncscope("workgroup") release, !dbg !46 + tail call void @llvm.amdgcn.s.barrier(), !dbg !46 + fence syncscope("workgroup") acquire, !dbg !46 + %185 = and i32 %41, 224, !dbg !46 + %186 = lshr i32 %33, 2, !dbg !46 + %187 = and i32 %186, 8, !dbg !46 + %188 = or disjoint i32 %185, %36, !dbg !46 + %189 = or disjoint i32 %187, 16, !dbg !46 + %190 = or disjoint i32 %187, 32, !dbg !46 + %191 = or disjoint i32 %187, 48, !dbg !46 + %192 = or disjoint i32 %187, 64, !dbg !46 + %193 = or disjoint i32 %187, 80, !dbg !46 + %194 = or disjoint i32 %187, 96, !dbg !46 + %195 = or disjoint i32 %187, 112, !dbg !46 + %196 = shl i32 %33, 3, !dbg !46 + %197 = and i32 %196, 120, !dbg !46 + %198 = xor i32 %187, %197, !dbg !46 + %199 = shl nuw nsw i32 %188, 7, !dbg !46 + %200 = or disjoint i32 %199, %198, !dbg !46 + %201 = xor i32 %189, %197, !dbg !46 + %202 = or disjoint i32 %199, %201, !dbg !46 + %203 = xor i32 %190, %197, !dbg !46 + %204 = or disjoint i32 %199, %203, !dbg !46 + %205 = xor i32 %191, %197, !dbg !46 + %206 = or disjoint i32 %199, %205, !dbg !46 + %207 = xor i32 %192, %197, !dbg !46 + %208 = or disjoint i32 %199, %207, !dbg !46 + %209 = xor i32 %193, %197, !dbg !46 + %210 = or disjoint i32 %199, %209, !dbg !46 + %211 = xor i32 %194, %197, !dbg !46 + %212 = or disjoint i32 %199, %211, !dbg !46 + %213 = xor i32 %195, %197, !dbg !46 + %214 = or disjoint i32 %199, %213, !dbg !46 + %215 = getelementptr half, ptr addrspace(3) @global_smem, i32 %200, !dbg !46 + %216 = load <8 x half>, ptr addrspace(3) %215, align 16, !dbg !46 + %217 = getelementptr half, ptr addrspace(3) @global_smem, i32 %202, !dbg !46 + %218 = load <8 x half>, ptr addrspace(3) %217, align 16, !dbg !46 + %219 = getelementptr half, ptr addrspace(3) @global_smem, i32 %204, !dbg !46 + %220 = load <8 x half>, ptr addrspace(3) %219, align 16, !dbg !46 + %221 = getelementptr half, ptr addrspace(3) @global_smem, i32 %206, !dbg !46 + %222 = load <8 x half>, ptr addrspace(3) %221, align 16, !dbg !46 + %223 = getelementptr half, ptr addrspace(3) @global_smem, i32 %208, !dbg !46 + %224 = load <8 x half>, ptr addrspace(3) %223, align 16, !dbg !46 + %225 = getelementptr half, ptr addrspace(3) @global_smem, i32 %210, !dbg !46 + %226 = load <8 x half>, ptr addrspace(3) %225, align 16, !dbg !46 + %227 = getelementptr half, ptr addrspace(3) @global_smem, i32 %212, !dbg !46 + %228 = load <8 x half>, ptr addrspace(3) %227, align 16, !dbg !46 + %229 = getelementptr half, ptr addrspace(3) @global_smem, i32 %214, !dbg !46 + %230 = load <8 x half>, ptr addrspace(3) %229, align 16, !dbg !46 + %231 = or disjoint i32 %167, %86, !dbg !47 + %.idx2 = shl nuw nsw i32 %231, 1, !dbg !47 + %232 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %.idx2, !dbg !47 + %233 = or disjoint i32 %170, %86, !dbg !47 + %.idx4 = shl nuw nsw i32 %233, 1, !dbg !47 + %234 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %.idx4, !dbg !47 + %235 = trunc i32 %10 to i16, !dbg !47 + %236 = and i16 %235, 16383, !dbg !47 + %237 = or disjoint i16 %236, 16384, !dbg !47 + %238 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %100, i16 %237, i32 2147483646, i32 159744), !dbg !47 + %gepdiff = sub nsw i32 %.idx, %.idx2, !dbg !47 + %.lhs.trunc = trunc nsw i32 %gepdiff to i16, !dbg !47 + %239 = sdiv i16 %.lhs.trunc, 16, !dbg !47 + %.sext = sext i16 %239 to i32, !dbg !47 + %240 = add nsw i32 %34, %.sext, !dbg !47 + %241 = shl nsw i32 %240, 2, !dbg !47 + %242 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %241, i32 %103), !dbg !47 + %243 = tail call i64 @llvm.amdgcn.ballot.i64(i1 true), !dbg !47 + %244 = zext nneg i32 %240 to i64, !dbg !47 + %245 = lshr i64 %243, %244, !dbg !47 + %246 = trunc i64 %245 to i1, !dbg !47 + %247 = shl i32 %242, 1, !dbg !47 + %248 = select i1 %246, i32 %247, i32 -2147483648, !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %238, ptr addrspace(3) %232, i32 16, i32 %248, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + %gepdiff5 = sub nsw i32 %.idx3, %.idx4, !dbg !47 + %.lhs.trunc374 = trunc nsw i32 %gepdiff5 to i16, !dbg !47 + %249 = sdiv i16 %.lhs.trunc374, 16, !dbg !47 + %.sext375 = sext i16 %249 to i32, !dbg !47 + %250 = add nsw i32 %34, %.sext375, !dbg !47 + %251 = shl nsw i32 %250, 2, !dbg !47 + %252 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %251, i32 %104), !dbg !47 + %253 = zext nneg i32 %250 to i64, !dbg !47 + %254 = lshr i64 %243, %253, !dbg !47 + %255 = trunc i64 %254 to i1, !dbg !47 + %256 = shl i32 %252, 1, !dbg !47 + %257 = select i1 %255, i32 %256, i32 -2147483648, !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %238, ptr addrspace(3) nonnull %234, i32 16, i32 %257, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + %258 = sext i32 %124 to i64, !dbg !51 + %259 = getelementptr half, ptr addrspace(1) %100, i64 %258, !dbg !51 + %260 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %.idx2, !dbg !47 + %261 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %.idx4, !dbg !47 + %262 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %259, i16 %237, i32 2147483646, i32 159744), !dbg !47 + %263 = sub nsw i32 %166, %86, !dbg !47 + %264 = ashr exact i32 %263, 3, !dbg !47 + %265 = add nsw i32 %264, %34, !dbg !47 + %266 = shl nsw i32 %265, 2, !dbg !47 + %267 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %266, i32 %103), !dbg !47 + %268 = zext nneg i32 %265 to i64, !dbg !47 + %269 = lshr i64 %243, %268, !dbg !47 + %270 = trunc i64 %269 to i1, !dbg !47 + %271 = shl i32 %267, 1, !dbg !47 + %272 = select i1 %270, i32 %271, i32 -2147483648, !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %262, ptr addrspace(3) %260, i32 16, i32 %272, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + %273 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %266, i32 %104), !dbg !47 + %274 = shl i32 %273, 1, !dbg !47 + %275 = select i1 %270, i32 %274, i32 -2147483648, !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %262, ptr addrspace(3) nonnull %261, i32 16, i32 %275, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + tail call void @llvm.amdgcn.s.waitcnt(i32 -49166), !dbg !47 + fence syncscope("workgroup") release, !dbg !47 + tail call void @llvm.amdgcn.s.barrier(), !dbg !47 + fence syncscope("workgroup") acquire, !dbg !47 + %276 = shl nuw nsw i32 %36, 7, !dbg !47 + %277 = or disjoint i32 %198, %276, !dbg !47 + %278 = or disjoint i32 %201, %276, !dbg !47 + %279 = or disjoint i32 %203, %276, !dbg !47 + %280 = or disjoint i32 %205, %276, !dbg !47 + %281 = or disjoint i32 %207, %276, !dbg !47 + %282 = or disjoint i32 %209, %276, !dbg !47 + %283 = or disjoint i32 %211, %276, !dbg !47 + %284 = or disjoint i32 %213, %276, !dbg !47 + %285 = or disjoint i32 %276, 4096, !dbg !47 + %286 = or disjoint i32 %198, %285, !dbg !47 + %287 = or disjoint i32 %201, %285, !dbg !47 + %288 = or disjoint i32 %203, %285, !dbg !47 + %289 = or disjoint i32 %205, %285, !dbg !47 + %290 = or disjoint i32 %207, %285, !dbg !47 + %291 = or disjoint i32 %209, %285, !dbg !47 + %292 = or disjoint i32 %211, %285, !dbg !47 + %293 = or disjoint i32 %213, %285, !dbg !47 + %294 = getelementptr half, ptr addrspace(3) @global_smem, i32 %277, !dbg !47 + %295 = load <8 x half>, ptr addrspace(3) %294, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %296 = getelementptr half, ptr addrspace(3) @global_smem, i32 %278, !dbg !47 + %297 = load <8 x half>, ptr addrspace(3) %296, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %298 = getelementptr half, ptr addrspace(3) @global_smem, i32 %279, !dbg !47 + %299 = load <8 x half>, ptr addrspace(3) %298, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %300 = getelementptr half, ptr addrspace(3) @global_smem, i32 %280, !dbg !47 + %301 = load <8 x half>, ptr addrspace(3) %300, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %302 = getelementptr half, ptr addrspace(3) @global_smem, i32 %281, !dbg !47 + %303 = load <8 x half>, ptr addrspace(3) %302, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %304 = getelementptr half, ptr addrspace(3) @global_smem, i32 %282, !dbg !47 + %305 = load <8 x half>, ptr addrspace(3) %304, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %306 = getelementptr half, ptr addrspace(3) @global_smem, i32 %283, !dbg !47 + %307 = load <8 x half>, ptr addrspace(3) %306, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %308 = getelementptr half, ptr addrspace(3) @global_smem, i32 %284, !dbg !47 + %309 = load <8 x half>, ptr addrspace(3) %308, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %310 = getelementptr half, ptr addrspace(3) @global_smem, i32 %286, !dbg !47 + %311 = load <8 x half>, ptr addrspace(3) %310, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %312 = getelementptr half, ptr addrspace(3) @global_smem, i32 %287, !dbg !47 + %313 = load <8 x half>, ptr addrspace(3) %312, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %314 = getelementptr half, ptr addrspace(3) @global_smem, i32 %288, !dbg !47 + %315 = load <8 x half>, ptr addrspace(3) %314, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %316 = getelementptr half, ptr addrspace(3) @global_smem, i32 %289, !dbg !47 + %317 = load <8 x half>, ptr addrspace(3) %316, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %318 = getelementptr half, ptr addrspace(3) @global_smem, i32 %290, !dbg !47 + %319 = load <8 x half>, ptr addrspace(3) %318, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %320 = getelementptr half, ptr addrspace(3) @global_smem, i32 %291, !dbg !47 + %321 = load <8 x half>, ptr addrspace(3) %320, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %322 = getelementptr half, ptr addrspace(3) @global_smem, i32 %292, !dbg !47 + %323 = load <8 x half>, ptr addrspace(3) %322, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %324 = getelementptr half, ptr addrspace(3) @global_smem, i32 %293, !dbg !47 + %325 = load <8 x half>, ptr addrspace(3) %324, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %326 = shl i32 %33, 1, !dbg !47 + %327 = and i32 %326, 96, !dbg !47 + %328 = xor i32 %86, %327, !dbg !47 + %329 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.idx2, !dbg !47 + %330 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.idx4, !dbg !47 + %331 = trunc i32 %13 to i16, !dbg !47 + %332 = and i16 %331, 16383, !dbg !47 + %333 = or disjoint i16 %332, 16384, !dbg !47 + %334 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %110, i16 %333, i32 2147483646, i32 159744), !dbg !47 + %335 = sub nsw i32 %328, %86, !dbg !47 + %336 = ashr exact i32 %335, 3, !dbg !47 + %337 = add nsw i32 %336, %34, !dbg !47 + %338 = shl nsw i32 %337, 2, !dbg !47 + %339 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %338, i32 %113), !dbg !47 + %340 = zext nneg i32 %337 to i64, !dbg !47 + %341 = lshr i64 %243, %340, !dbg !47 + %342 = trunc i64 %341 to i1, !dbg !47 + %343 = shl i32 %339, 1, !dbg !47 + %344 = select i1 %342, i32 %343, i32 -2147483648, !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %334, ptr addrspace(3) %329, i32 16, i32 %344, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + %345 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %338, i32 %114), !dbg !47 + %346 = shl i32 %345, 1, !dbg !47 + %347 = select i1 %342, i32 %346, i32 -2147483648, !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %334, ptr addrspace(3) nonnull %330, i32 16, i32 %347, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + %348 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %295, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0), !dbg !54 + %349 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %297, <8 x half> %218, <16 x float> %348, i32 0, i32 0, i32 0), !dbg !54 + %350 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %299, <8 x half> %220, <16 x float> %349, i32 0, i32 0, i32 0), !dbg !54 + %351 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %301, <8 x half> %222, <16 x float> %350, i32 0, i32 0, i32 0), !dbg !54 + %352 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %303, <8 x half> %224, <16 x float> %351, i32 0, i32 0, i32 0), !dbg !54 + %353 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %305, <8 x half> %226, <16 x float> %352, i32 0, i32 0, i32 0), !dbg !54 + %354 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %307, <8 x half> %228, <16 x float> %353, i32 0, i32 0, i32 0), !dbg !54 + %355 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %309, <8 x half> %230, <16 x float> %354, i32 0, i32 0, i32 0), !dbg !54 + %356 = extractelement <16 x float> %355, i64 0, !dbg !54 + %357 = extractelement <16 x float> %355, i64 1, !dbg !54 + %358 = extractelement <16 x float> %355, i64 2, !dbg !54 + %359 = extractelement <16 x float> %355, i64 3, !dbg !54 + %360 = extractelement <16 x float> %355, i64 4, !dbg !54 + %361 = extractelement <16 x float> %355, i64 5, !dbg !54 + %362 = extractelement <16 x float> %355, i64 6, !dbg !54 + %363 = extractelement <16 x float> %355, i64 7, !dbg !54 + %364 = extractelement <16 x float> %355, i64 8, !dbg !54 + %365 = extractelement <16 x float> %355, i64 9, !dbg !54 + %366 = extractelement <16 x float> %355, i64 10, !dbg !54 + %367 = extractelement <16 x float> %355, i64 11, !dbg !54 + %368 = extractelement <16 x float> %355, i64 12, !dbg !54 + %369 = extractelement <16 x float> %355, i64 13, !dbg !54 + %370 = extractelement <16 x float> %355, i64 14, !dbg !54 + %371 = extractelement <16 x float> %355, i64 15, !dbg !54 + %372 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %311, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0), !dbg !54 + %373 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %313, <8 x half> %218, <16 x float> %372, i32 0, i32 0, i32 0), !dbg !54 + %374 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %315, <8 x half> %220, <16 x float> %373, i32 0, i32 0, i32 0), !dbg !54 + %375 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %317, <8 x half> %222, <16 x float> %374, i32 0, i32 0, i32 0), !dbg !54 + %376 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %319, <8 x half> %224, <16 x float> %375, i32 0, i32 0, i32 0), !dbg !54 + %377 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %321, <8 x half> %226, <16 x float> %376, i32 0, i32 0, i32 0), !dbg !54 + %378 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %323, <8 x half> %228, <16 x float> %377, i32 0, i32 0, i32 0), !dbg !54 + %379 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %325, <8 x half> %230, <16 x float> %378, i32 0, i32 0, i32 0), !dbg !54 + %380 = extractelement <16 x float> %379, i64 0, !dbg !54 + %381 = extractelement <16 x float> %379, i64 1, !dbg !54 + %382 = extractelement <16 x float> %379, i64 2, !dbg !54 + %383 = extractelement <16 x float> %379, i64 3, !dbg !54 + %384 = extractelement <16 x float> %379, i64 4, !dbg !54 + %385 = extractelement <16 x float> %379, i64 5, !dbg !54 + %386 = extractelement <16 x float> %379, i64 6, !dbg !54 + %387 = extractelement <16 x float> %379, i64 7, !dbg !54 + %388 = extractelement <16 x float> %379, i64 8, !dbg !54 + %389 = extractelement <16 x float> %379, i64 9, !dbg !54 + %390 = extractelement <16 x float> %379, i64 10, !dbg !54 + %391 = extractelement <16 x float> %379, i64 11, !dbg !54 + %392 = extractelement <16 x float> %379, i64 12, !dbg !54 + %393 = extractelement <16 x float> %379, i64 13, !dbg !54 + %394 = extractelement <16 x float> %379, i64 14, !dbg !54 + %395 = extractelement <16 x float> %379, i64 15, !dbg !54 + %396 = getelementptr half, ptr addrspace(1) %259, i64 %258, !dbg !51 + %397 = sext i32 %125 to i64, !dbg !55 + %398 = getelementptr half, ptr addrspace(1) %110, i64 %397, !dbg !55 + %399 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %396, i16 %237, i32 2147483646, i32 159744), !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %399, ptr addrspace(3) %232, i32 16, i32 %248, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %399, ptr addrspace(3) nonnull %234, i32 16, i32 %257, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + %400 = tail call float @llvm.maxnum.f32(float %356, float %357), !dbg !56 + %401 = tail call float @llvm.maxnum.f32(float %400, float %358), !dbg !56 + %402 = tail call float @llvm.maxnum.f32(float %401, float %359), !dbg !56 + %403 = tail call float @llvm.maxnum.f32(float %402, float %360), !dbg !56 + %404 = tail call float @llvm.maxnum.f32(float %403, float %361), !dbg !56 + %405 = tail call float @llvm.maxnum.f32(float %404, float %362), !dbg !56 + %406 = tail call float @llvm.maxnum.f32(float %405, float %363), !dbg !56 + %407 = tail call float @llvm.maxnum.f32(float %406, float %364), !dbg !56 + %408 = tail call float @llvm.maxnum.f32(float %407, float %365), !dbg !56 + %409 = tail call float @llvm.maxnum.f32(float %408, float %366), !dbg !56 + %410 = tail call float @llvm.maxnum.f32(float %409, float %367), !dbg !56 + %411 = tail call float @llvm.maxnum.f32(float %410, float %368), !dbg !56 + %412 = tail call float @llvm.maxnum.f32(float %411, float %369), !dbg !56 + %413 = tail call float @llvm.maxnum.f32(float %412, float %370), !dbg !56 + %414 = tail call float @llvm.maxnum.f32(float %413, float %371), !dbg !56 + %415 = tail call float @llvm.maxnum.f32(float %414, float %380), !dbg !56 + %416 = tail call float @llvm.maxnum.f32(float %415, float %381), !dbg !56 + %417 = tail call float @llvm.maxnum.f32(float %416, float %382), !dbg !56 + %418 = tail call float @llvm.maxnum.f32(float %417, float %383), !dbg !56 + %419 = tail call float @llvm.maxnum.f32(float %418, float %384), !dbg !56 + %420 = tail call float @llvm.maxnum.f32(float %419, float %385), !dbg !56 + %421 = tail call float @llvm.maxnum.f32(float %420, float %386), !dbg !56 + %422 = tail call float @llvm.maxnum.f32(float %421, float %387), !dbg !56 + %423 = tail call float @llvm.maxnum.f32(float %422, float %388), !dbg !56 + %424 = tail call float @llvm.maxnum.f32(float %423, float %389), !dbg !56 + %425 = tail call float @llvm.maxnum.f32(float %424, float %390), !dbg !56 + %426 = tail call float @llvm.maxnum.f32(float %425, float %391), !dbg !56 + %427 = tail call float @llvm.maxnum.f32(float %426, float %392), !dbg !56 + %428 = tail call float @llvm.maxnum.f32(float %427, float %393), !dbg !56 + %429 = tail call float @llvm.maxnum.f32(float %428, float %394), !dbg !56 + %430 = tail call float @llvm.maxnum.f32(float %429, float %395), !dbg !56 + %431 = bitcast float %430 to i32, !dbg !59 + %432 = tail call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %431, i32 %431, i1 false, i1 false), !dbg !59 + %433 = extractvalue { i32, i32 } %432, 0, !dbg !59 + %434 = extractvalue { i32, i32 } %432, 1, !dbg !59 + %435 = bitcast i32 %433 to float, !dbg !59 + %436 = bitcast i32 %434 to float, !dbg !59 + %437 = tail call float @llvm.maxnum.f32(float %435, float %436), !dbg !56 + %438 = tail call float @llvm.maxnum.f32(float %437, float 0xFFF0000000000000), !dbg !60 + %439 = fmul float %438, 0x3FC0527DC0000000, !dbg !61 + %440 = fmul float %356, 0x3FC0527DC0000000, !dbg !62 + %441 = fmul float %357, 0x3FC0527DC0000000, !dbg !62 + %442 = fmul float %358, 0x3FC0527DC0000000, !dbg !62 + %443 = fmul float %359, 0x3FC0527DC0000000, !dbg !62 + %444 = fmul float %360, 0x3FC0527DC0000000, !dbg !62 + %445 = fmul float %361, 0x3FC0527DC0000000, !dbg !62 + %446 = fmul float %362, 0x3FC0527DC0000000, !dbg !62 + %447 = fmul float %363, 0x3FC0527DC0000000, !dbg !62 + %448 = fmul float %364, 0x3FC0527DC0000000, !dbg !62 + %449 = fmul float %365, 0x3FC0527DC0000000, !dbg !62 + %450 = fmul float %366, 0x3FC0527DC0000000, !dbg !62 + %451 = fmul float %367, 0x3FC0527DC0000000, !dbg !62 + %452 = fmul float %368, 0x3FC0527DC0000000, !dbg !62 + %453 = fmul float %369, 0x3FC0527DC0000000, !dbg !62 + %454 = fmul float %370, 0x3FC0527DC0000000, !dbg !62 + %455 = fmul float %371, 0x3FC0527DC0000000, !dbg !62 + %456 = fmul float %380, 0x3FC0527DC0000000, !dbg !62 + %457 = fmul float %381, 0x3FC0527DC0000000, !dbg !62 + %458 = fmul float %382, 0x3FC0527DC0000000, !dbg !62 + %459 = fmul float %383, 0x3FC0527DC0000000, !dbg !62 + %460 = fmul float %384, 0x3FC0527DC0000000, !dbg !62 + %461 = fmul float %385, 0x3FC0527DC0000000, !dbg !62 + %462 = fmul float %386, 0x3FC0527DC0000000, !dbg !62 + %463 = fmul float %387, 0x3FC0527DC0000000, !dbg !62 + %464 = fmul float %388, 0x3FC0527DC0000000, !dbg !62 + %465 = fmul float %389, 0x3FC0527DC0000000, !dbg !62 + %466 = fmul float %390, 0x3FC0527DC0000000, !dbg !62 + %467 = fmul float %391, 0x3FC0527DC0000000, !dbg !62 + %468 = fmul float %392, 0x3FC0527DC0000000, !dbg !62 + %469 = fmul float %393, 0x3FC0527DC0000000, !dbg !62 + %470 = fmul float %394, 0x3FC0527DC0000000, !dbg !62 + %471 = fmul float %395, 0x3FC0527DC0000000, !dbg !62 + %472 = fsub float %440, %439, !dbg !63 + %473 = fsub float %441, %439, !dbg !63 + %474 = fsub float %442, %439, !dbg !63 + %475 = fsub float %443, %439, !dbg !63 + %476 = fsub float %444, %439, !dbg !63 + %477 = fsub float %445, %439, !dbg !63 + %478 = fsub float %446, %439, !dbg !63 + %479 = fsub float %447, %439, !dbg !63 + %480 = fsub float %448, %439, !dbg !63 + %481 = fsub float %449, %439, !dbg !63 + %482 = fsub float %450, %439, !dbg !63 + %483 = fsub float %451, %439, !dbg !63 + %484 = fsub float %452, %439, !dbg !63 + %485 = fsub float %453, %439, !dbg !63 + %486 = fsub float %454, %439, !dbg !63 + %487 = fsub float %455, %439, !dbg !63 + %488 = fsub float %456, %439, !dbg !63 + %489 = fsub float %457, %439, !dbg !63 + %490 = fsub float %458, %439, !dbg !63 + %491 = fsub float %459, %439, !dbg !63 + %492 = fsub float %460, %439, !dbg !63 + %493 = fsub float %461, %439, !dbg !63 + %494 = fsub float %462, %439, !dbg !63 + %495 = fsub float %463, %439, !dbg !63 + %496 = fsub float %464, %439, !dbg !63 + %497 = fsub float %465, %439, !dbg !63 + %498 = fsub float %466, %439, !dbg !63 + %499 = fsub float %467, %439, !dbg !63 + %500 = fsub float %468, %439, !dbg !63 + %501 = fsub float %469, %439, !dbg !63 + %502 = fsub float %470, %439, !dbg !63 + %503 = fsub float %471, %439, !dbg !63 + %504 = tail call float @llvm.amdgcn.exp2.f32(float %472), !dbg !64 + %505 = tail call float @llvm.amdgcn.exp2.f32(float %473), !dbg !64 + %506 = tail call float @llvm.amdgcn.exp2.f32(float %474), !dbg !64 + %507 = tail call float @llvm.amdgcn.exp2.f32(float %475), !dbg !64 + %508 = tail call float @llvm.amdgcn.exp2.f32(float %476), !dbg !64 + %509 = tail call float @llvm.amdgcn.exp2.f32(float %477), !dbg !64 + %510 = tail call float @llvm.amdgcn.exp2.f32(float %478), !dbg !64 + %511 = tail call float @llvm.amdgcn.exp2.f32(float %479), !dbg !64 + %512 = tail call float @llvm.amdgcn.exp2.f32(float %480), !dbg !64 + %513 = tail call float @llvm.amdgcn.exp2.f32(float %481), !dbg !64 + %514 = tail call float @llvm.amdgcn.exp2.f32(float %482), !dbg !64 + %515 = tail call float @llvm.amdgcn.exp2.f32(float %483), !dbg !64 + %516 = tail call float @llvm.amdgcn.exp2.f32(float %484), !dbg !64 + %517 = tail call float @llvm.amdgcn.exp2.f32(float %485), !dbg !64 + %518 = tail call float @llvm.amdgcn.exp2.f32(float %486), !dbg !64 + %519 = tail call float @llvm.amdgcn.exp2.f32(float %487), !dbg !64 + %520 = tail call float @llvm.amdgcn.exp2.f32(float %488), !dbg !64 + %521 = tail call float @llvm.amdgcn.exp2.f32(float %489), !dbg !64 + %522 = tail call float @llvm.amdgcn.exp2.f32(float %490), !dbg !64 + %523 = tail call float @llvm.amdgcn.exp2.f32(float %491), !dbg !64 + %524 = tail call float @llvm.amdgcn.exp2.f32(float %492), !dbg !64 + %525 = tail call float @llvm.amdgcn.exp2.f32(float %493), !dbg !64 + %526 = tail call float @llvm.amdgcn.exp2.f32(float %494), !dbg !64 + %527 = tail call float @llvm.amdgcn.exp2.f32(float %495), !dbg !64 + %528 = tail call float @llvm.amdgcn.exp2.f32(float %496), !dbg !64 + %529 = tail call float @llvm.amdgcn.exp2.f32(float %497), !dbg !64 + %530 = tail call float @llvm.amdgcn.exp2.f32(float %498), !dbg !64 + %531 = tail call float @llvm.amdgcn.exp2.f32(float %499), !dbg !64 + %532 = tail call float @llvm.amdgcn.exp2.f32(float %500), !dbg !64 + %533 = tail call float @llvm.amdgcn.exp2.f32(float %501), !dbg !64 + %534 = tail call float @llvm.amdgcn.exp2.f32(float %502), !dbg !64 + %535 = tail call float @llvm.amdgcn.exp2.f32(float %503), !dbg !64 + %536 = fsub float 0xFFF0000000000000, %439, !dbg !65 + %537 = tail call float @llvm.amdgcn.exp2.f32(float %536), !dbg !66 + tail call void @llvm.amdgcn.s.waitcnt(i32 -49164), !dbg !47 + fence syncscope("workgroup") release, !dbg !47 + tail call void @llvm.amdgcn.s.barrier(), !dbg !47 + fence syncscope("workgroup") acquire, !dbg !47 + %538 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %277, !dbg !47 + %539 = load <8 x half>, ptr addrspace(3) %538, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %540 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %278, !dbg !47 + %541 = load <8 x half>, ptr addrspace(3) %540, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %542 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %279, !dbg !47 + %543 = load <8 x half>, ptr addrspace(3) %542, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %544 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %280, !dbg !47 + %545 = load <8 x half>, ptr addrspace(3) %544, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %546 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %281, !dbg !47 + %547 = load <8 x half>, ptr addrspace(3) %546, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %548 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %282, !dbg !47 + %549 = load <8 x half>, ptr addrspace(3) %548, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %550 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %283, !dbg !47 + %551 = load <8 x half>, ptr addrspace(3) %550, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %552 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %284, !dbg !47 + %553 = load <8 x half>, ptr addrspace(3) %552, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %554 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %286, !dbg !47 + %555 = load <8 x half>, ptr addrspace(3) %554, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %556 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %287, !dbg !47 + %557 = load <8 x half>, ptr addrspace(3) %556, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %558 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %288, !dbg !47 + %559 = load <8 x half>, ptr addrspace(3) %558, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %560 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %289, !dbg !47 + %561 = load <8 x half>, ptr addrspace(3) %560, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %562 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %290, !dbg !47 + %563 = load <8 x half>, ptr addrspace(3) %562, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %564 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %291, !dbg !47 + %565 = load <8 x half>, ptr addrspace(3) %564, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %566 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %292, !dbg !47 + %567 = load <8 x half>, ptr addrspace(3) %566, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %568 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %293, !dbg !47 + %569 = load <8 x half>, ptr addrspace(3) %568, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %570 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %.idx2, !dbg !47 + %571 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %.idx4, !dbg !47 + %572 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %398, i16 %333, i32 2147483646, i32 159744), !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %572, ptr addrspace(3) %570, i32 16, i32 %344, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %572, ptr addrspace(3) nonnull %571, i32 16, i32 %347, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + fence syncscope("workgroup") release, !dbg !67 + tail call void @llvm.amdgcn.s.barrier(), !dbg !67 + fence syncscope("workgroup") acquire, !dbg !67 + %.off = add i32 %33, 255, !dbg !68 + %573 = icmp ult i32 %.off, 511, !dbg !68 + br i1 %573, label %575, label %574, !dbg !69 + +574: ; preds = %28 + tail call void @llvm.amdgcn.s.barrier(), !dbg !69 + br label %575, !dbg !69 + +575: ; preds = %574, %28 + %576 = shl i32 %33, 2 + %577 = and i32 %576, 12 + %578 = or disjoint i32 %577, %84 + %579 = and i32 %186, 3 + %580 = or disjoint i32 %85, %35 + %581 = or disjoint i32 %580, %578 + %582 = or disjoint i32 %579, %82 + %583 = shl nuw nsw i32 %582, 7 + %584 = or disjoint i32 %581, %583 + %585 = or disjoint i32 %583, 1024 + %586 = or disjoint i32 %581, %585 + %587 = or disjoint i32 %583, 2048 + %588 = or disjoint i32 %581, %587 + %589 = or disjoint i32 %583, 3072 + %590 = or disjoint i32 %581, %589 + %591 = or disjoint i32 %583, 4096 + %592 = or disjoint i32 %581, %591 + %593 = or disjoint i32 %583, 5120 + %594 = or disjoint i32 %581, %593 + %595 = or disjoint i32 %583, 6144 + %596 = or disjoint i32 %581, %595 + %597 = or disjoint i32 %583, 7168 + %598 = or disjoint i32 %581, %597 + %599 = or disjoint i32 %577, 32 + %600 = xor i32 %599, %84 + %601 = or disjoint i32 %35, %600 + %602 = or disjoint i32 %601, %85 + %603 = or disjoint i32 %578, 64 + %604 = xor i32 %603, %85 + %605 = or disjoint i32 %604, %35 + %606 = or disjoint i32 %577, 96 + %607 = and i32 %83, 96 + %608 = xor i32 %607, %606 + %609 = or disjoint i32 %608, %35 + %610 = shufflevector <8 x half> %539, <8 x half> poison, <2 x i32> + %611 = shufflevector <8 x half> %539, <8 x half> poison, <2 x i32> + %612 = shufflevector <8 x half> %539, <8 x half> poison, <2 x i32> + %613 = shufflevector <8 x half> %539, <8 x half> poison, <2 x i32> + %614 = shufflevector <8 x half> %541, <8 x half> poison, <2 x i32> + %615 = shufflevector <8 x half> %541, <8 x half> poison, <2 x i32> + %616 = shufflevector <8 x half> %541, <8 x half> poison, <2 x i32> + %617 = shufflevector <8 x half> %541, <8 x half> poison, <2 x i32> + %618 = shufflevector <8 x half> %543, <8 x half> poison, <2 x i32> + %619 = shufflevector <8 x half> %543, <8 x half> poison, <2 x i32> + %620 = shufflevector <8 x half> %543, <8 x half> poison, <2 x i32> + %621 = shufflevector <8 x half> %543, <8 x half> poison, <2 x i32> + %622 = shufflevector <8 x half> %545, <8 x half> poison, <2 x i32> + %623 = shufflevector <8 x half> %545, <8 x half> poison, <2 x i32> + %624 = shufflevector <8 x half> %545, <8 x half> poison, <2 x i32> + %625 = shufflevector <8 x half> %545, <8 x half> poison, <2 x i32> + %626 = shufflevector <8 x half> %547, <8 x half> poison, <2 x i32> + %627 = shufflevector <8 x half> %547, <8 x half> poison, <2 x i32> + %628 = shufflevector <8 x half> %547, <8 x half> poison, <2 x i32> + %629 = shufflevector <8 x half> %547, <8 x half> poison, <2 x i32> + %630 = shufflevector <8 x half> %549, <8 x half> poison, <2 x i32> + %631 = shufflevector <8 x half> %549, <8 x half> poison, <2 x i32> + %632 = shufflevector <8 x half> %549, <8 x half> poison, <2 x i32> + %633 = shufflevector <8 x half> %549, <8 x half> poison, <2 x i32> + %634 = shufflevector <8 x half> %551, <8 x half> poison, <2 x i32> + %635 = shufflevector <8 x half> %551, <8 x half> poison, <2 x i32> + %636 = shufflevector <8 x half> %551, <8 x half> poison, <2 x i32> + %637 = shufflevector <8 x half> %551, <8 x half> poison, <2 x i32> + %638 = shufflevector <8 x half> %553, <8 x half> poison, <2 x i32> + %639 = shufflevector <8 x half> %553, <8 x half> poison, <2 x i32> + %640 = shufflevector <8 x half> %553, <8 x half> poison, <2 x i32> + %641 = shufflevector <8 x half> %553, <8 x half> poison, <2 x i32> + %642 = shufflevector <8 x half> %555, <8 x half> poison, <2 x i32> + %643 = shufflevector <8 x half> %555, <8 x half> poison, <2 x i32> + %644 = shufflevector <8 x half> %555, <8 x half> poison, <2 x i32> + %645 = shufflevector <8 x half> %555, <8 x half> poison, <2 x i32> + %646 = shufflevector <8 x half> %557, <8 x half> poison, <2 x i32> + %647 = shufflevector <8 x half> %557, <8 x half> poison, <2 x i32> + %648 = shufflevector <8 x half> %557, <8 x half> poison, <2 x i32> + %649 = shufflevector <8 x half> %557, <8 x half> poison, <2 x i32> + %650 = shufflevector <8 x half> %559, <8 x half> poison, <2 x i32> + %651 = shufflevector <8 x half> %559, <8 x half> poison, <2 x i32> + %652 = shufflevector <8 x half> %559, <8 x half> poison, <2 x i32> + %653 = shufflevector <8 x half> %559, <8 x half> poison, <2 x i32> + %654 = shufflevector <8 x half> %561, <8 x half> poison, <2 x i32> + %655 = shufflevector <8 x half> %561, <8 x half> poison, <2 x i32> + %656 = shufflevector <8 x half> %561, <8 x half> poison, <2 x i32> + %657 = shufflevector <8 x half> %561, <8 x half> poison, <2 x i32> + %658 = shufflevector <8 x half> %563, <8 x half> poison, <2 x i32> + %659 = shufflevector <8 x half> %563, <8 x half> poison, <2 x i32> + %660 = shufflevector <8 x half> %563, <8 x half> poison, <2 x i32> + %661 = shufflevector <8 x half> %563, <8 x half> poison, <2 x i32> + %662 = shufflevector <8 x half> %565, <8 x half> poison, <2 x i32> + %663 = shufflevector <8 x half> %565, <8 x half> poison, <2 x i32> + %664 = shufflevector <8 x half> %565, <8 x half> poison, <2 x i32> + %665 = shufflevector <8 x half> %565, <8 x half> poison, <2 x i32> + %666 = shufflevector <8 x half> %567, <8 x half> poison, <2 x i32> + %667 = shufflevector <8 x half> %567, <8 x half> poison, <2 x i32> + %668 = shufflevector <8 x half> %567, <8 x half> poison, <2 x i32> + %669 = shufflevector <8 x half> %567, <8 x half> poison, <2 x i32> + %670 = shufflevector <8 x half> %569, <8 x half> poison, <2 x i32> + %671 = shufflevector <8 x half> %569, <8 x half> poison, <2 x i32> + %672 = shufflevector <8 x half> %569, <8 x half> poison, <2 x i32> + %673 = shufflevector <8 x half> %569, <8 x half> poison, <2 x i32> + %674 = shl nuw nsw i32 %36, 3 + %675 = and i32 %674, 120 + %676 = xor i32 %675, %187 + %677 = shl nuw nsw i32 %36, 7 + %678 = xor i32 %675, %189 + %679 = xor i32 %675, %190 + %680 = xor i32 %675, %191 + %681 = xor i32 %675, %192 + %682 = xor i32 %675, %193 + %683 = xor i32 %675, %194 + %684 = xor i32 %675, %195 + %685 = or disjoint i32 %677, 4096 + br label %686, !dbg !70 + +686: ; preds = %575, %686 + %687 = phi ptr addrspace(3) [ @global_smem, %575 ], [ %1121, %686 ] + %688 = phi ptr addrspace(3) [ getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), %575 ], [ %1404, %686 ] + %689 = phi ptr addrspace(3) [ getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), %575 ], [ %688, %686 ] + %.pn347561 = phi float [ %537, %575 ], [ %1363, %686 ] + %.pn283560 = phi float [ %535, %575 ], [ %1360, %686 ] + %.pn285559 = phi float [ %534, %575 ], [ %1359, %686 ] + %.pn287558 = phi float [ %533, %575 ], [ %1358, %686 ] + %.pn289557 = phi float [ %532, %575 ], [ %1357, %686 ] + %.pn291556 = phi float [ %531, %575 ], [ %1356, %686 ] + %.pn293555 = phi float [ %530, %575 ], [ %1355, %686 ] + %.pn295554 = phi float [ %529, %575 ], [ %1354, %686 ] + %.pn297553 = phi float [ %528, %575 ], [ %1353, %686 ] + %.pn299552 = phi float [ %527, %575 ], [ %1352, %686 ] + %.pn301551 = phi float [ %526, %575 ], [ %1351, %686 ] + %.pn303550 = phi float [ %525, %575 ], [ %1350, %686 ] + %.pn305549 = phi float [ %524, %575 ], [ %1349, %686 ] + %.pn307548 = phi float [ %523, %575 ], [ %1348, %686 ] + %.pn309547 = phi float [ %522, %575 ], [ %1347, %686 ] + %.pn311546 = phi float [ %521, %575 ], [ %1346, %686 ] + %.pn313545 = phi float [ %520, %575 ], [ %1345, %686 ] + %.pn315544 = phi float [ %519, %575 ], [ %1344, %686 ] + %.pn317543 = phi float [ %518, %575 ], [ %1343, %686 ] + %.pn319542 = phi float [ %517, %575 ], [ %1342, %686 ] + %.pn321541 = phi float [ %516, %575 ], [ %1341, %686 ] + %.pn323540 = phi float [ %515, %575 ], [ %1340, %686 ] + %.pn325539 = phi float [ %514, %575 ], [ %1339, %686 ] + %.pn327538 = phi float [ %513, %575 ], [ %1338, %686 ] + %.pn329537 = phi float [ %512, %575 ], [ %1337, %686 ] + %.pn331536 = phi float [ %511, %575 ], [ %1336, %686 ] + %.pn333535 = phi float [ %510, %575 ], [ %1335, %686 ] + %.pn335534 = phi float [ %509, %575 ], [ %1334, %686 ] + %.pn337533 = phi float [ %508, %575 ], [ %1333, %686 ] + %.pn339532 = phi float [ %507, %575 ], [ %1332, %686 ] + %.pn341531 = phi float [ %506, %575 ], [ %1331, %686 ] + %.pn343530 = phi float [ %505, %575 ], [ %1330, %686 ] + %.pn345529 = phi float [ %504, %575 ], [ %1329, %686 ] + %690 = phi i32 [ 0, %575 ], [ %1120, %686 ] + %691 = phi ptr addrspace(1) [ %398, %575 ], [ %1117, %686 ] + %692 = phi ptr addrspace(1) [ %396, %575 ], [ %1116, %686 ] + %.pn26400 = phi float [ %438, %575 ], [ %1263, %686 ] + %693 = phi float [ 1.000000e+00, %575 ], [ %992, %686 ] + %694 = phi i32 [ 0, %575 ], [ %1412, %686 ] + %695 = phi <2 x half> [ %610, %575 ], [ %1414, %686 ] + %696 = phi <2 x half> [ %611, %575 ], [ %1415, %686 ] + %697 = phi <2 x half> [ %612, %575 ], [ %1416, %686 ] + %698 = phi <2 x half> [ %613, %575 ], [ %1417, %686 ] + %699 = phi <2 x half> [ %614, %575 ], [ %1418, %686 ] + %700 = phi <2 x half> [ %615, %575 ], [ %1419, %686 ] + %701 = phi <2 x half> [ %616, %575 ], [ %1420, %686 ] + %702 = phi <2 x half> [ %617, %575 ], [ %1421, %686 ] + %703 = phi <2 x half> [ %618, %575 ], [ %1422, %686 ] + %704 = phi <2 x half> [ %619, %575 ], [ %1423, %686 ] + %705 = phi <2 x half> [ %620, %575 ], [ %1424, %686 ] + %706 = phi <2 x half> [ %621, %575 ], [ %1425, %686 ] + %707 = phi <2 x half> [ %622, %575 ], [ %1426, %686 ] + %708 = phi <2 x half> [ %623, %575 ], [ %1427, %686 ] + %709 = phi <2 x half> [ %624, %575 ], [ %1428, %686 ] + %710 = phi <2 x half> [ %625, %575 ], [ %1429, %686 ] + %711 = phi <2 x half> [ %626, %575 ], [ %1430, %686 ] + %712 = phi <2 x half> [ %627, %575 ], [ %1431, %686 ] + %713 = phi <2 x half> [ %628, %575 ], [ %1432, %686 ] + %714 = phi <2 x half> [ %629, %575 ], [ %1433, %686 ] + %715 = phi <2 x half> [ %630, %575 ], [ %1434, %686 ] + %716 = phi <2 x half> [ %631, %575 ], [ %1435, %686 ] + %717 = phi <2 x half> [ %632, %575 ], [ %1436, %686 ] + %718 = phi <2 x half> [ %633, %575 ], [ %1437, %686 ] + %719 = phi <2 x half> [ %634, %575 ], [ %1438, %686 ] + %720 = phi <2 x half> [ %635, %575 ], [ %1439, %686 ] + %721 = phi <2 x half> [ %636, %575 ], [ %1440, %686 ] + %722 = phi <2 x half> [ %637, %575 ], [ %1441, %686 ] + %723 = phi <2 x half> [ %638, %575 ], [ %1442, %686 ] + %724 = phi <2 x half> [ %639, %575 ], [ %1443, %686 ] + %725 = phi <2 x half> [ %640, %575 ], [ %1444, %686 ] + %726 = phi <2 x half> [ %641, %575 ], [ %1445, %686 ] + %727 = phi <2 x half> [ %642, %575 ], [ %1446, %686 ] + %728 = phi <2 x half> [ %643, %575 ], [ %1447, %686 ] + %729 = phi <2 x half> [ %644, %575 ], [ %1448, %686 ] + %730 = phi <2 x half> [ %645, %575 ], [ %1449, %686 ] + %731 = phi <2 x half> [ %646, %575 ], [ %1450, %686 ] + %732 = phi <2 x half> [ %647, %575 ], [ %1451, %686 ] + %733 = phi <2 x half> [ %648, %575 ], [ %1452, %686 ] + %734 = phi <2 x half> [ %649, %575 ], [ %1453, %686 ] + %735 = phi <2 x half> [ %650, %575 ], [ %1454, %686 ] + %736 = phi <2 x half> [ %651, %575 ], [ %1455, %686 ] + %737 = phi <2 x half> [ %652, %575 ], [ %1456, %686 ] + %738 = phi <2 x half> [ %653, %575 ], [ %1457, %686 ] + %739 = phi <2 x half> [ %654, %575 ], [ %1458, %686 ] + %740 = phi <2 x half> [ %655, %575 ], [ %1459, %686 ] + %741 = phi <2 x half> [ %656, %575 ], [ %1460, %686 ] + %742 = phi <2 x half> [ %657, %575 ], [ %1461, %686 ] + %743 = phi <2 x half> [ %658, %575 ], [ %1462, %686 ] + %744 = phi <2 x half> [ %659, %575 ], [ %1463, %686 ] + %745 = phi <2 x half> [ %660, %575 ], [ %1464, %686 ] + %746 = phi <2 x half> [ %661, %575 ], [ %1465, %686 ] + %747 = phi <2 x half> [ %662, %575 ], [ %1466, %686 ] + %748 = phi <2 x half> [ %663, %575 ], [ %1467, %686 ] + %749 = phi <2 x half> [ %664, %575 ], [ %1468, %686 ] + %750 = phi <2 x half> [ %665, %575 ], [ %1469, %686 ] + %751 = phi <2 x half> [ %666, %575 ], [ %1470, %686 ] + %752 = phi <2 x half> [ %667, %575 ], [ %1471, %686 ] + %753 = phi <2 x half> [ %668, %575 ], [ %1472, %686 ] + %754 = phi <2 x half> [ %669, %575 ], [ %1473, %686 ] + %755 = phi <2 x half> [ %670, %575 ], [ %1474, %686 ] + %756 = phi <2 x half> [ %671, %575 ], [ %1475, %686 ] + %757 = phi <2 x half> [ %672, %575 ], [ %1476, %686 ] + %758 = phi <2 x half> [ %673, %575 ], [ %1477, %686 ] + %759 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ] + %760 = phi <2 x float> [ zeroinitializer, %575 ], [ %1479, %686 ] + %761 = phi <2 x float> [ zeroinitializer, %575 ], [ %1480, %686 ] + %762 = phi <2 x float> [ zeroinitializer, %575 ], [ %1481, %686 ] + %763 = phi <2 x float> [ zeroinitializer, %575 ], [ %1482, %686 ] + %764 = phi <2 x float> [ zeroinitializer, %575 ], [ %1483, %686 ] + %765 = phi <2 x float> [ zeroinitializer, %575 ], [ %1484, %686 ] + %766 = phi <2 x float> [ zeroinitializer, %575 ], [ %1485, %686 ] + %767 = phi <2 x float> [ zeroinitializer, %575 ], [ %1486, %686 ] + %768 = phi <2 x float> [ zeroinitializer, %575 ], [ %1487, %686 ] + %769 = phi <2 x float> [ zeroinitializer, %575 ], [ %1488, %686 ] + %770 = phi <2 x float> [ zeroinitializer, %575 ], [ %1489, %686 ] + %771 = phi <2 x float> [ zeroinitializer, %575 ], [ %1490, %686 ] + %772 = phi <2 x float> [ zeroinitializer, %575 ], [ %1491, %686 ] + %773 = phi <2 x float> [ zeroinitializer, %575 ], [ %1492, %686 ] + %774 = phi <2 x float> [ zeroinitializer, %575 ], [ %1493, %686 ] + %775 = phi <2 x float> [ zeroinitializer, %575 ], [ %1494, %686 ] + %776 = phi <2 x float> [ zeroinitializer, %575 ], [ %1495, %686 ] + %777 = phi <2 x float> [ zeroinitializer, %575 ], [ %1496, %686 ] + %778 = phi <2 x float> [ zeroinitializer, %575 ], [ %1497, %686 ] + %779 = phi <2 x float> [ zeroinitializer, %575 ], [ %1498, %686 ] + %780 = phi <2 x float> [ zeroinitializer, %575 ], [ %1499, %686 ] + %781 = phi <2 x float> [ zeroinitializer, %575 ], [ %1500, %686 ] + %782 = phi <2 x float> [ zeroinitializer, %575 ], [ %1501, %686 ] + %783 = phi <2 x float> [ zeroinitializer, %575 ], [ %1502, %686 ] + %784 = phi <2 x float> [ zeroinitializer, %575 ], [ %1503, %686 ] + %785 = phi <2 x float> [ zeroinitializer, %575 ], [ %1504, %686 ] + %786 = phi <2 x float> [ zeroinitializer, %575 ], [ %1505, %686 ] + %787 = phi <2 x float> [ zeroinitializer, %575 ], [ %1506, %686 ] + %788 = phi <2 x float> [ zeroinitializer, %575 ], [ %1507, %686 ] + %789 = phi <2 x float> [ zeroinitializer, %575 ], [ %1508, %686 ] + %790 = phi <2 x float> [ zeroinitializer, %575 ], [ %1509, %686 ] + %791 = shufflevector <2 x half> %695, <2 x half> %696, <8 x i32> , !dbg !54 + %792 = shufflevector <2 x half> %697, <2 x half> poison, <8 x i32> , !dbg !54 + %793 = shufflevector <8 x half> %791, <8 x half> %792, <8 x i32> , !dbg !54 + %794 = shufflevector <2 x half> %698, <2 x half> poison, <8 x i32> , !dbg !54 + %795 = shufflevector <8 x half> %793, <8 x half> %794, <8 x i32> , !dbg !54 + %796 = shufflevector <2 x half> %699, <2 x half> %700, <8 x i32> , !dbg !54 + %797 = shufflevector <2 x half> %701, <2 x half> poison, <8 x i32> , !dbg !54 + %798 = shufflevector <8 x half> %796, <8 x half> %797, <8 x i32> , !dbg !54 + %799 = shufflevector <2 x half> %702, <2 x half> poison, <8 x i32> , !dbg !54 + %800 = shufflevector <8 x half> %798, <8 x half> %799, <8 x i32> , !dbg !54 + %801 = shufflevector <2 x half> %703, <2 x half> %704, <8 x i32> , !dbg !54 + %802 = shufflevector <2 x half> %705, <2 x half> poison, <8 x i32> , !dbg !54 + %803 = shufflevector <8 x half> %801, <8 x half> %802, <8 x i32> , !dbg !54 + %804 = shufflevector <2 x half> %706, <2 x half> poison, <8 x i32> , !dbg !54 + %805 = shufflevector <8 x half> %803, <8 x half> %804, <8 x i32> , !dbg !54 + %806 = shufflevector <2 x half> %707, <2 x half> %708, <8 x i32> , !dbg !54 + %807 = shufflevector <2 x half> %709, <2 x half> poison, <8 x i32> , !dbg !54 + %808 = shufflevector <8 x half> %806, <8 x half> %807, <8 x i32> , !dbg !54 + %809 = shufflevector <2 x half> %710, <2 x half> poison, <8 x i32> , !dbg !54 + %810 = shufflevector <8 x half> %808, <8 x half> %809, <8 x i32> , !dbg !54 + %811 = shufflevector <2 x half> %711, <2 x half> %712, <8 x i32> , !dbg !54 + %812 = shufflevector <2 x half> %713, <2 x half> poison, <8 x i32> , !dbg !54 + %813 = shufflevector <8 x half> %811, <8 x half> %812, <8 x i32> , !dbg !54 + %814 = shufflevector <2 x half> %714, <2 x half> poison, <8 x i32> , !dbg !54 + %815 = shufflevector <8 x half> %813, <8 x half> %814, <8 x i32> , !dbg !54 + %816 = shufflevector <2 x half> %715, <2 x half> %716, <8 x i32> , !dbg !54 + %817 = shufflevector <2 x half> %717, <2 x half> poison, <8 x i32> , !dbg !54 + %818 = shufflevector <8 x half> %816, <8 x half> %817, <8 x i32> , !dbg !54 + %819 = shufflevector <2 x half> %718, <2 x half> poison, <8 x i32> , !dbg !54 + %820 = shufflevector <8 x half> %818, <8 x half> %819, <8 x i32> , !dbg !54 + %821 = shufflevector <2 x half> %719, <2 x half> %720, <8 x i32> , !dbg !54 + %822 = shufflevector <2 x half> %721, <2 x half> poison, <8 x i32> , !dbg !54 + %823 = shufflevector <8 x half> %821, <8 x half> %822, <8 x i32> , !dbg !54 + %824 = shufflevector <2 x half> %722, <2 x half> poison, <8 x i32> , !dbg !54 + %825 = shufflevector <8 x half> %823, <8 x half> %824, <8 x i32> , !dbg !54 + %826 = shufflevector <2 x half> %723, <2 x half> %724, <8 x i32> , !dbg !54 + %827 = shufflevector <2 x half> %725, <2 x half> poison, <8 x i32> , !dbg !54 + %828 = shufflevector <8 x half> %826, <8 x half> %827, <8 x i32> , !dbg !54 + %829 = shufflevector <2 x half> %726, <2 x half> poison, <8 x i32> , !dbg !54 + %830 = shufflevector <8 x half> %828, <8 x half> %829, <8 x i32> , !dbg !54 + %831 = shufflevector <2 x half> %727, <2 x half> %728, <8 x i32> , !dbg !54 + %832 = shufflevector <2 x half> %729, <2 x half> poison, <8 x i32> , !dbg !54 + %833 = shufflevector <8 x half> %831, <8 x half> %832, <8 x i32> , !dbg !54 + %834 = shufflevector <2 x half> %730, <2 x half> poison, <8 x i32> , !dbg !54 + %835 = shufflevector <8 x half> %833, <8 x half> %834, <8 x i32> , !dbg !54 + %836 = shufflevector <2 x half> %731, <2 x half> %732, <8 x i32> , !dbg !54 + %837 = shufflevector <2 x half> %733, <2 x half> poison, <8 x i32> , !dbg !54 + %838 = shufflevector <8 x half> %836, <8 x half> %837, <8 x i32> , !dbg !54 + %839 = shufflevector <2 x half> %734, <2 x half> poison, <8 x i32> , !dbg !54 + %840 = shufflevector <8 x half> %838, <8 x half> %839, <8 x i32> , !dbg !54 + %841 = shufflevector <2 x half> %735, <2 x half> %736, <8 x i32> , !dbg !54 + %842 = shufflevector <2 x half> %737, <2 x half> poison, <8 x i32> , !dbg !54 + %843 = shufflevector <8 x half> %841, <8 x half> %842, <8 x i32> , !dbg !54 + %844 = shufflevector <2 x half> %738, <2 x half> poison, <8 x i32> , !dbg !54 + %845 = shufflevector <8 x half> %843, <8 x half> %844, <8 x i32> , !dbg !54 + %846 = shufflevector <2 x half> %739, <2 x half> %740, <8 x i32> , !dbg !54 + %847 = shufflevector <2 x half> %741, <2 x half> poison, <8 x i32> , !dbg !54 + %848 = shufflevector <8 x half> %846, <8 x half> %847, <8 x i32> , !dbg !54 + %849 = shufflevector <2 x half> %742, <2 x half> poison, <8 x i32> , !dbg !54 + %850 = shufflevector <8 x half> %848, <8 x half> %849, <8 x i32> , !dbg !54 + %851 = shufflevector <2 x half> %743, <2 x half> %744, <8 x i32> , !dbg !54 + %852 = shufflevector <2 x half> %745, <2 x half> poison, <8 x i32> , !dbg !54 + %853 = shufflevector <8 x half> %851, <8 x half> %852, <8 x i32> , !dbg !54 + %854 = shufflevector <2 x half> %746, <2 x half> poison, <8 x i32> , !dbg !54 + %855 = shufflevector <8 x half> %853, <8 x half> %854, <8 x i32> , !dbg !54 + %856 = shufflevector <2 x half> %747, <2 x half> %748, <8 x i32> , !dbg !54 + %857 = shufflevector <2 x half> %749, <2 x half> poison, <8 x i32> , !dbg !54 + %858 = shufflevector <8 x half> %856, <8 x half> %857, <8 x i32> , !dbg !54 + %859 = shufflevector <2 x half> %750, <2 x half> poison, <8 x i32> , !dbg !54 + %860 = shufflevector <8 x half> %858, <8 x half> %859, <8 x i32> , !dbg !54 + %861 = shufflevector <2 x half> %751, <2 x half> %752, <8 x i32> , !dbg !54 + %862 = shufflevector <2 x half> %753, <2 x half> poison, <8 x i32> , !dbg !54 + %863 = shufflevector <8 x half> %861, <8 x half> %862, <8 x i32> , !dbg !54 + %864 = shufflevector <2 x half> %754, <2 x half> poison, <8 x i32> , !dbg !54 + %865 = shufflevector <8 x half> %863, <8 x half> %864, <8 x i32> , !dbg !54 + %866 = shufflevector <2 x half> %755, <2 x half> %756, <8 x i32> , !dbg !54 + %867 = shufflevector <2 x half> %757, <2 x half> poison, <8 x i32> , !dbg !54 + %868 = shufflevector <8 x half> %866, <8 x half> %867, <8 x i32> , !dbg !54 + %869 = shufflevector <2 x half> %758, <2 x half> poison, <8 x i32> , !dbg !54 + %870 = shufflevector <8 x half> %868, <8 x half> %869, <8 x i32> , !dbg !54 + %871 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %795, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0), !dbg !54 + tail call void @llvm.amdgcn.s.setprio(i16 0), !dbg !71 + %872 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %800, <8 x half> %218, <16 x float> %871, i32 0, i32 0, i32 0), !dbg !54 + %873 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %805, <8 x half> %220, <16 x float> %872, i32 0, i32 0, i32 0), !dbg !54 + %874 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %810, <8 x half> %222, <16 x float> %873, i32 0, i32 0, i32 0), !dbg !54 + %875 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %815, <8 x half> %224, <16 x float> %874, i32 0, i32 0, i32 0), !dbg !54 + %876 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %820, <8 x half> %226, <16 x float> %875, i32 0, i32 0, i32 0), !dbg !54 + %877 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %825, <8 x half> %228, <16 x float> %876, i32 0, i32 0, i32 0), !dbg !54 + %878 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %830, <8 x half> %230, <16 x float> %877, i32 0, i32 0, i32 0), !dbg !54 + %879 = extractelement <16 x float> %878, i64 0, !dbg !54 + %880 = extractelement <16 x float> %878, i64 1, !dbg !54 + %881 = extractelement <16 x float> %878, i64 2, !dbg !54 + %882 = extractelement <16 x float> %878, i64 3, !dbg !54 + %883 = extractelement <16 x float> %878, i64 4, !dbg !54 + %884 = extractelement <16 x float> %878, i64 5, !dbg !54 + %885 = extractelement <16 x float> %878, i64 6, !dbg !54 + %886 = extractelement <16 x float> %878, i64 7, !dbg !54 + %887 = extractelement <16 x float> %878, i64 8, !dbg !54 + %888 = extractelement <16 x float> %878, i64 9, !dbg !54 + %889 = extractelement <16 x float> %878, i64 10, !dbg !54 + %890 = extractelement <16 x float> %878, i64 11, !dbg !54 + %891 = extractelement <16 x float> %878, i64 12, !dbg !54 + %892 = extractelement <16 x float> %878, i64 13, !dbg !54 + %893 = extractelement <16 x float> %878, i64 14, !dbg !54 + %894 = extractelement <16 x float> %878, i64 15, !dbg !54 + %895 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %835, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0), !dbg !54 + %896 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %840, <8 x half> %218, <16 x float> %895, i32 0, i32 0, i32 0), !dbg !54 + %897 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %845, <8 x half> %220, <16 x float> %896, i32 0, i32 0, i32 0), !dbg !54 + %898 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %850, <8 x half> %222, <16 x float> %897, i32 0, i32 0, i32 0), !dbg !54 + %899 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %855, <8 x half> %224, <16 x float> %898, i32 0, i32 0, i32 0), !dbg !54 + %900 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %860, <8 x half> %226, <16 x float> %899, i32 0, i32 0, i32 0), !dbg !54 + %901 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %865, <8 x half> %228, <16 x float> %900, i32 0, i32 0, i32 0), !dbg !54 + %902 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %870, <8 x half> %230, <16 x float> %901, i32 0, i32 0, i32 0), !dbg !54 + %903 = extractelement <16 x float> %902, i64 0, !dbg !54 + %904 = extractelement <16 x float> %902, i64 1, !dbg !54 + %905 = extractelement <16 x float> %902, i64 2, !dbg !54 + %906 = extractelement <16 x float> %902, i64 3, !dbg !54 + %907 = extractelement <16 x float> %902, i64 4, !dbg !54 + %908 = extractelement <16 x float> %902, i64 5, !dbg !54 + %909 = extractelement <16 x float> %902, i64 6, !dbg !54 + %910 = extractelement <16 x float> %902, i64 7, !dbg !54 + %911 = extractelement <16 x float> %902, i64 8, !dbg !54 + %912 = extractelement <16 x float> %902, i64 9, !dbg !54 + %913 = extractelement <16 x float> %902, i64 10, !dbg !54 + %914 = extractelement <16 x float> %902, i64 11, !dbg !54 + %915 = extractelement <16 x float> %902, i64 12, !dbg !54 + %916 = extractelement <16 x float> %902, i64 13, !dbg !54 + %917 = extractelement <16 x float> %902, i64 14, !dbg !54 + %918 = extractelement <16 x float> %902, i64 15, !dbg !54 + %919 = fadd float %.pn345529, %.pn343530, !dbg !72 + %920 = fadd float %919, %.pn341531, !dbg !72 + %921 = fadd float %920, %.pn339532, !dbg !72 + %922 = fadd float %921, %.pn337533, !dbg !72 + %923 = fadd float %922, %.pn335534, !dbg !72 + %924 = fadd float %923, %.pn333535, !dbg !72 + %925 = fadd float %924, %.pn331536, !dbg !72 + %926 = fadd float %925, %.pn329537, !dbg !72 + %927 = fadd float %926, %.pn327538, !dbg !72 + %928 = fadd float %927, %.pn325539, !dbg !72 + %929 = fadd float %928, %.pn323540, !dbg !72 + %930 = fadd float %929, %.pn321541, !dbg !72 + %931 = fadd float %930, %.pn319542, !dbg !72 + %932 = fadd float %931, %.pn317543, !dbg !72 + %933 = fadd float %932, %.pn315544, !dbg !72 + %934 = fadd float %933, %.pn313545, !dbg !72 + %935 = fadd float %934, %.pn311546, !dbg !72 + %936 = fadd float %935, %.pn309547, !dbg !72 + %937 = fadd float %936, %.pn307548, !dbg !72 + %938 = fadd float %937, %.pn305549, !dbg !72 + %939 = fadd float %938, %.pn303550, !dbg !72 + %940 = fadd float %939, %.pn301551, !dbg !72 + %941 = fadd float %940, %.pn299552, !dbg !72 + %942 = fadd float %941, %.pn297553, !dbg !72 + %943 = fadd float %942, %.pn295554, !dbg !72 + %944 = fadd float %943, %.pn293555, !dbg !72 + %945 = fadd float %944, %.pn291556, !dbg !72 + %946 = fadd float %945, %.pn289557, !dbg !72 + %947 = fadd float %946, %.pn287558, !dbg !72 + %948 = fadd float %947, %.pn285559, !dbg !72 + %949 = fadd float %948, %.pn283560, !dbg !72 + %950 = bitcast float %949 to i32, !dbg !73 + %951 = tail call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %950, i32 %950, i1 false, i1 false), !dbg !73 + %952 = extractvalue { i32, i32 } %951, 0, !dbg !73 + %953 = extractvalue { i32, i32 } %951, 1, !dbg !73 + %954 = bitcast i32 %952 to float, !dbg !73 + %955 = bitcast i32 %953 to float, !dbg !73 + %956 = fadd float %954, %955, !dbg !72 + %957 = insertelement <2 x float> poison, float %.pn347561, i64 0, !dbg !74 + %958 = shufflevector <2 x float> %957, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !74 + %959 = fmul <2 x float> %759, %958, !dbg !74 + %960 = fmul <2 x float> %760, %958, !dbg !74 + %961 = fmul <2 x float> %761, %958, !dbg !74 + %962 = fmul <2 x float> %762, %958, !dbg !74 + %963 = fmul <2 x float> %763, %958, !dbg !74 + %964 = fmul <2 x float> %764, %958, !dbg !74 + %965 = fmul <2 x float> %765, %958, !dbg !74 + %966 = fmul <2 x float> %766, %958, !dbg !74 + %967 = fmul <2 x float> %783, %958, !dbg !74 + %968 = fmul <2 x float> %784, %958, !dbg !74 + %969 = fmul <2 x float> %785, %958, !dbg !74 + %970 = fmul <2 x float> %786, %958, !dbg !74 + %971 = fmul <2 x float> %787, %958, !dbg !74 + %972 = fmul <2 x float> %788, %958, !dbg !74 + %973 = fmul <2 x float> %789, %958, !dbg !74 + %974 = fmul <2 x float> %790, %958, !dbg !74 + %975 = fmul <2 x float> %775, %958, !dbg !74 + %976 = fmul <2 x float> %776, %958, !dbg !74 + %977 = fmul <2 x float> %777, %958, !dbg !74 + %978 = fmul <2 x float> %778, %958, !dbg !74 + %979 = fmul <2 x float> %779, %958, !dbg !74 + %980 = fmul <2 x float> %780, %958, !dbg !74 + %981 = fmul <2 x float> %781, %958, !dbg !74 + %982 = fmul <2 x float> %782, %958, !dbg !74 + %983 = fmul <2 x float> %767, %958, !dbg !74 + %984 = fmul <2 x float> %768, %958, !dbg !74 + %985 = fmul <2 x float> %769, %958, !dbg !74 + %986 = fmul <2 x float> %770, %958, !dbg !74 + %987 = fmul <2 x float> %771, %958, !dbg !74 + %988 = fmul <2 x float> %772, %958, !dbg !74 + %989 = fmul <2 x float> %773, %958, !dbg !74 + %990 = fmul <2 x float> %774, %958, !dbg !74 + %991 = fmul float %693, %.pn347561, !dbg !75 + %992 = fadd float %991, %956, !dbg !76 + %993 = insertelement <2 x float> poison, float %.pn345529, i64 0, !dbg !77 + %994 = insertelement <2 x float> %993, float %.pn343530, i64 1, !dbg !77 + %995 = fptrunc <2 x float> %994 to <2 x half>, !dbg !77 + %996 = insertelement <2 x float> poison, float %.pn341531, i64 0, !dbg !77 + %997 = insertelement <2 x float> %996, float %.pn339532, i64 1, !dbg !77 + %998 = fptrunc <2 x float> %997 to <2 x half>, !dbg !77 + %999 = insertelement <2 x float> poison, float %.pn337533, i64 0, !dbg !77 + %1000 = insertelement <2 x float> %999, float %.pn335534, i64 1, !dbg !77 + %1001 = fptrunc <2 x float> %1000 to <2 x half>, !dbg !77 + %1002 = shufflevector <2 x half> %1001, <2 x half> poison, <8 x i32> + %1003 = insertelement <2 x float> poison, float %.pn333535, i64 0, !dbg !77 + %1004 = insertelement <2 x float> %1003, float %.pn331536, i64 1, !dbg !77 + %1005 = fptrunc <2 x float> %1004 to <2 x half>, !dbg !77 + %1006 = shufflevector <2 x half> %1005, <2 x half> poison, <8 x i32> + %1007 = insertelement <2 x float> poison, float %.pn329537, i64 0, !dbg !77 + %1008 = insertelement <2 x float> %1007, float %.pn327538, i64 1, !dbg !77 + %1009 = fptrunc <2 x float> %1008 to <2 x half>, !dbg !77 + %1010 = insertelement <2 x float> poison, float %.pn325539, i64 0, !dbg !77 + %1011 = insertelement <2 x float> %1010, float %.pn323540, i64 1, !dbg !77 + %1012 = fptrunc <2 x float> %1011 to <2 x half>, !dbg !77 + %1013 = insertelement <2 x float> poison, float %.pn321541, i64 0, !dbg !77 + %1014 = insertelement <2 x float> %1013, float %.pn319542, i64 1, !dbg !77 + %1015 = fptrunc <2 x float> %1014 to <2 x half>, !dbg !77 + %1016 = shufflevector <2 x half> %1015, <2 x half> poison, <8 x i32> + %1017 = insertelement <2 x float> poison, float %.pn317543, i64 0, !dbg !77 + %1018 = insertelement <2 x float> %1017, float %.pn315544, i64 1, !dbg !77 + %1019 = fptrunc <2 x float> %1018 to <2 x half>, !dbg !77 + %1020 = shufflevector <2 x half> %1019, <2 x half> poison, <8 x i32> + %1021 = insertelement <2 x float> poison, float %.pn313545, i64 0, !dbg !77 + %1022 = insertelement <2 x float> %1021, float %.pn311546, i64 1, !dbg !77 + %1023 = fptrunc <2 x float> %1022 to <2 x half>, !dbg !77 + %1024 = insertelement <2 x float> poison, float %.pn309547, i64 0, !dbg !77 + %1025 = insertelement <2 x float> %1024, float %.pn307548, i64 1, !dbg !77 + %1026 = fptrunc <2 x float> %1025 to <2 x half>, !dbg !77 + %1027 = insertelement <2 x float> poison, float %.pn305549, i64 0, !dbg !77 + %1028 = insertelement <2 x float> %1027, float %.pn303550, i64 1, !dbg !77 + %1029 = fptrunc <2 x float> %1028 to <2 x half>, !dbg !77 + %1030 = shufflevector <2 x half> %1029, <2 x half> poison, <8 x i32> + %1031 = insertelement <2 x float> poison, float %.pn301551, i64 0, !dbg !77 + %1032 = insertelement <2 x float> %1031, float %.pn299552, i64 1, !dbg !77 + %1033 = fptrunc <2 x float> %1032 to <2 x half>, !dbg !77 + %1034 = shufflevector <2 x half> %1033, <2 x half> poison, <8 x i32> + %1035 = insertelement <2 x float> poison, float %.pn297553, i64 0, !dbg !77 + %1036 = insertelement <2 x float> %1035, float %.pn295554, i64 1, !dbg !77 + %1037 = fptrunc <2 x float> %1036 to <2 x half>, !dbg !77 + %1038 = insertelement <2 x float> poison, float %.pn293555, i64 0, !dbg !77 + %1039 = insertelement <2 x float> %1038, float %.pn291556, i64 1, !dbg !77 + %1040 = fptrunc <2 x float> %1039 to <2 x half>, !dbg !77 + %1041 = insertelement <2 x float> poison, float %.pn289557, i64 0, !dbg !77 + %1042 = insertelement <2 x float> %1041, float %.pn287558, i64 1, !dbg !77 + %1043 = fptrunc <2 x float> %1042 to <2 x half>, !dbg !77 + %1044 = shufflevector <2 x half> %1043, <2 x half> poison, <8 x i32> + %1045 = insertelement <2 x float> poison, float %.pn285559, i64 0, !dbg !77 + %1046 = insertelement <2 x float> %1045, float %.pn283560, i64 1, !dbg !77 + %1047 = fptrunc <2 x float> %1046 to <2 x half>, !dbg !77 + %1048 = shufflevector <2 x half> %1047, <2 x half> poison, <8 x i32> + tail call void @llvm.amdgcn.s.setprio(i16 1), !dbg !78 + tail call void @llvm.amdgcn.s.waitcnt(i32 -49164), !dbg !47 + fence syncscope("workgroup") release, !dbg !47 + tail call void @llvm.amdgcn.s.barrier(), !dbg !47 + fence syncscope("workgroup") acquire, !dbg !47 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !79 + %1049 = getelementptr inbounds nuw half, ptr addrspace(3) %689, i32 %584, !dbg !47 + %1050 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %1049), !dbg !47, !alias.scope !52, !noalias !48 + %1051 = getelementptr inbounds nuw half, ptr addrspace(3) %689, i32 %586, !dbg !47 + %1052 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1051), !dbg !47, !alias.scope !52, !noalias !48 + %1053 = getelementptr inbounds nuw half, ptr addrspace(3) %689, i32 %588, !dbg !47 + %1054 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1053), !dbg !47, !alias.scope !52, !noalias !48 + %1055 = getelementptr inbounds nuw half, ptr addrspace(3) %689, i32 %590, !dbg !47 + %1056 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1055), !dbg !47, !alias.scope !52, !noalias !48 + %1057 = getelementptr inbounds nuw half, ptr addrspace(3) %689, i32 %592, !dbg !47 + %1058 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1057), !dbg !47, !alias.scope !52, !noalias !48 + %1059 = getelementptr inbounds nuw half, ptr addrspace(3) %689, i32 %594, !dbg !47 + %1060 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1059), !dbg !47, !alias.scope !52, !noalias !48 + %1061 = getelementptr inbounds nuw half, ptr addrspace(3) %689, i32 %596, !dbg !47 + %1062 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1061), !dbg !47, !alias.scope !52, !noalias !48 + %1063 = getelementptr inbounds nuw half, ptr addrspace(3) %689, i32 %598, !dbg !47 + %1064 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1063), !dbg !47, !alias.scope !52, !noalias !48 + %1065 = getelementptr inbounds nuw half, ptr addrspace(3) %689, i32 %602, !dbg !47 + %1066 = getelementptr inbounds nuw half, ptr addrspace(3) %1065, i32 %583, !dbg !47 + %1067 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %1066), !dbg !47, !alias.scope !52, !noalias !48 + %1068 = getelementptr inbounds nuw half, ptr addrspace(3) %1065, i32 %585, !dbg !47 + %1069 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1068), !dbg !47, !alias.scope !52, !noalias !48 + %1070 = getelementptr inbounds nuw half, ptr addrspace(3) %1065, i32 %587, !dbg !47 + %1071 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1070), !dbg !47, !alias.scope !52, !noalias !48 + %1072 = getelementptr inbounds nuw half, ptr addrspace(3) %1065, i32 %589, !dbg !47 + %1073 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1072), !dbg !47, !alias.scope !52, !noalias !48 + %1074 = getelementptr inbounds nuw half, ptr addrspace(3) %1065, i32 %591, !dbg !47 + %1075 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1074), !dbg !47, !alias.scope !52, !noalias !48 + %1076 = getelementptr inbounds nuw half, ptr addrspace(3) %1065, i32 %593, !dbg !47 + %1077 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1076), !dbg !47, !alias.scope !52, !noalias !48 + %1078 = getelementptr inbounds nuw half, ptr addrspace(3) %1065, i32 %595, !dbg !47 + %1079 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1078), !dbg !47, !alias.scope !52, !noalias !48 + %1080 = getelementptr inbounds nuw half, ptr addrspace(3) %1065, i32 %597, !dbg !47 + %1081 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1080), !dbg !47, !alias.scope !52, !noalias !48 + %1082 = getelementptr inbounds nuw half, ptr addrspace(3) %689, i32 %605, !dbg !47 + %1083 = getelementptr inbounds nuw half, ptr addrspace(3) %1082, i32 %583, !dbg !47 + %1084 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %1083), !dbg !47, !alias.scope !52, !noalias !48 + %1085 = getelementptr inbounds nuw half, ptr addrspace(3) %1082, i32 %585, !dbg !47 + %1086 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1085), !dbg !47, !alias.scope !52, !noalias !48 + %1087 = getelementptr inbounds nuw half, ptr addrspace(3) %1082, i32 %587, !dbg !47 + %1088 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1087), !dbg !47, !alias.scope !52, !noalias !48 + %1089 = getelementptr inbounds nuw half, ptr addrspace(3) %1082, i32 %589, !dbg !47 + %1090 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1089), !dbg !47, !alias.scope !52, !noalias !48 + %1091 = getelementptr inbounds nuw half, ptr addrspace(3) %1082, i32 %591, !dbg !47 + %1092 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1091), !dbg !47, !alias.scope !52, !noalias !48 + %1093 = getelementptr inbounds nuw half, ptr addrspace(3) %1082, i32 %593, !dbg !47 + %1094 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1093), !dbg !47, !alias.scope !52, !noalias !48 + %1095 = getelementptr inbounds nuw half, ptr addrspace(3) %1082, i32 %595, !dbg !47 + %1096 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1095), !dbg !47, !alias.scope !52, !noalias !48 + %1097 = getelementptr inbounds nuw half, ptr addrspace(3) %1082, i32 %597, !dbg !47 + %1098 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1097), !dbg !47, !alias.scope !52, !noalias !48 + %1099 = getelementptr inbounds nuw half, ptr addrspace(3) %689, i32 %609, !dbg !47 + %1100 = getelementptr inbounds nuw half, ptr addrspace(3) %1099, i32 %583, !dbg !47 + %1101 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %1100), !dbg !47, !alias.scope !52, !noalias !48 + %1102 = getelementptr inbounds nuw half, ptr addrspace(3) %1099, i32 %585, !dbg !47 + %1103 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1102), !dbg !47, !alias.scope !52, !noalias !48 + %1104 = getelementptr inbounds nuw half, ptr addrspace(3) %1099, i32 %587, !dbg !47 + %1105 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1104), !dbg !47, !alias.scope !52, !noalias !48 + %1106 = getelementptr inbounds nuw half, ptr addrspace(3) %1099, i32 %589, !dbg !47 + %1107 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1106), !dbg !47, !alias.scope !52, !noalias !48 + %1108 = getelementptr inbounds nuw half, ptr addrspace(3) %1099, i32 %591, !dbg !47 + %1109 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1108), !dbg !47, !alias.scope !52, !noalias !48 + %1110 = getelementptr inbounds nuw half, ptr addrspace(3) %1099, i32 %593, !dbg !47 + %1111 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1110), !dbg !47, !alias.scope !52, !noalias !48 + %1112 = getelementptr inbounds nuw half, ptr addrspace(3) %1099, i32 %595, !dbg !47 + %1113 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1112), !dbg !47, !alias.scope !52, !noalias !48 + %1114 = getelementptr inbounds nuw half, ptr addrspace(3) %1099, i32 %597, !dbg !47 + %1115 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1114), !dbg !47, !alias.scope !52, !noalias !48 + %1116 = getelementptr half, ptr addrspace(1) %692, i64 %258, !dbg !51 + %1117 = getelementptr half, ptr addrspace(1) %691, i64 %397, !dbg !55 + %1118 = add i32 %690, 1, !dbg !70 + %1119 = icmp slt i32 %1118, 2, !dbg !70 + %1120 = select i1 %1119, i32 %1118, i32 0, !dbg !70 + %.idx359 = shl i32 %1120, 14, !dbg !47 + %1121 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx359, !dbg !47 + %1122 = getelementptr inbounds nuw i8, ptr addrspace(3) %1121, i32 %.idx2, !dbg !47 + %1123 = getelementptr inbounds nuw i8, ptr addrspace(3) %1121, i32 %.idx4, !dbg !47 + %1124 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %1116, i16 %237, i32 2147483646, i32 159744), !dbg !47 + %1125 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %266, i32 %103), !dbg !47 + %1126 = shl i32 %1125, 1, !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %1124, ptr addrspace(3) %1122, i32 16, i32 %1126, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + %1127 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %266, i32 %104), !dbg !47 + %1128 = shl i32 %1127, 1, !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %1124, ptr addrspace(3) nonnull %1123, i32 16, i32 %1128, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !80 + %1129 = shufflevector <2 x half> %995, <2 x half> %998, <8 x i32> , !dbg !81 + %1130 = shufflevector <8 x half> %1129, <8 x half> %1002, <8 x i32> , !dbg !81 + %1131 = shufflevector <8 x half> %1130, <8 x half> %1006, <8 x i32> , !dbg !81 + %1132 = shufflevector <2 x half> %1009, <2 x half> %1012, <8 x i32> , !dbg !81 + %1133 = shufflevector <8 x half> %1132, <8 x half> %1016, <8 x i32> , !dbg !81 + %1134 = shufflevector <8 x half> %1133, <8 x half> %1020, <8 x i32> , !dbg !81 + %1135 = shufflevector <2 x half> %1023, <2 x half> %1026, <8 x i32> , !dbg !81 + %1136 = shufflevector <8 x half> %1135, <8 x half> %1030, <8 x i32> , !dbg !81 + %1137 = shufflevector <8 x half> %1136, <8 x half> %1034, <8 x i32> , !dbg !81 + %1138 = shufflevector <2 x half> %1037, <2 x half> %1040, <8 x i32> , !dbg !81 + %1139 = shufflevector <8 x half> %1138, <8 x half> %1044, <8 x i32> , !dbg !81 + %1140 = shufflevector <8 x half> %1139, <8 x half> %1048, <8 x i32> , !dbg !81 + %1141 = shufflevector <4 x half> %1050, <4 x half> %1052, <8 x i32> , !dbg !81 + %1142 = shufflevector <4 x half> %1054, <4 x half> %1056, <8 x i32> , !dbg !81 + %1143 = shufflevector <4 x half> %1058, <4 x half> %1060, <8 x i32> , !dbg !81 + %1144 = shufflevector <4 x half> %1062, <4 x half> %1064, <8 x i32> , !dbg !81 + %1145 = shufflevector <4 x half> %1067, <4 x half> %1069, <8 x i32> , !dbg !81 + %1146 = shufflevector <4 x half> %1071, <4 x half> %1073, <8 x i32> , !dbg !81 + %1147 = shufflevector <4 x half> %1075, <4 x half> %1077, <8 x i32> , !dbg !81 + %1148 = shufflevector <4 x half> %1079, <4 x half> %1081, <8 x i32> , !dbg !81 + %1149 = shufflevector <4 x half> %1084, <4 x half> %1086, <8 x i32> , !dbg !81 + %1150 = shufflevector <4 x half> %1088, <4 x half> %1090, <8 x i32> , !dbg !81 + %1151 = shufflevector <4 x half> %1092, <4 x half> %1094, <8 x i32> , !dbg !81 + %1152 = shufflevector <4 x half> %1096, <4 x half> %1098, <8 x i32> , !dbg !81 + %1153 = shufflevector <4 x half> %1101, <4 x half> %1103, <8 x i32> , !dbg !81 + %1154 = shufflevector <4 x half> %1105, <4 x half> %1107, <8 x i32> , !dbg !81 + %1155 = shufflevector <4 x half> %1109, <4 x half> %1111, <8 x i32> , !dbg !81 + %1156 = shufflevector <4 x half> %1113, <4 x half> %1115, <8 x i32> , !dbg !81 + %1157 = shufflevector <2 x float> %959, <2 x float> %960, <16 x i32> , !dbg !81 + %1158 = shufflevector <2 x float> %961, <2 x float> poison, <16 x i32> , !dbg !81 + %1159 = shufflevector <16 x float> %1157, <16 x float> %1158, <16 x i32> , !dbg !81 + %1160 = shufflevector <2 x float> %962, <2 x float> poison, <16 x i32> , !dbg !81 + %1161 = shufflevector <16 x float> %1159, <16 x float> %1160, <16 x i32> , !dbg !81 + %1162 = shufflevector <2 x float> %963, <2 x float> poison, <16 x i32> , !dbg !81 + %1163 = shufflevector <16 x float> %1161, <16 x float> %1162, <16 x i32> , !dbg !81 + %1164 = shufflevector <2 x float> %964, <2 x float> poison, <16 x i32> , !dbg !81 + %1165 = shufflevector <16 x float> %1163, <16 x float> %1164, <16 x i32> , !dbg !81 + %1166 = shufflevector <2 x float> %965, <2 x float> poison, <16 x i32> , !dbg !81 + %1167 = shufflevector <16 x float> %1165, <16 x float> %1166, <16 x i32> , !dbg !81 + %1168 = shufflevector <2 x float> %966, <2 x float> poison, <16 x i32> , !dbg !81 + %1169 = shufflevector <16 x float> %1167, <16 x float> %1168, <16 x i32> , !dbg !81 + %1170 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1141, <8 x half> %1131, <16 x float> %1169, i32 0, i32 0, i32 0), !dbg !81 + tail call void @llvm.amdgcn.s.setprio(i16 0), !dbg !82 + %1171 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1142, <8 x half> %1134, <16 x float> %1170, i32 0, i32 0, i32 0), !dbg !81 + %1172 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1143, <8 x half> %1137, <16 x float> %1171, i32 0, i32 0, i32 0), !dbg !81 + %1173 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1144, <8 x half> %1140, <16 x float> %1172, i32 0, i32 0, i32 0), !dbg !81 + %1174 = shufflevector <2 x float> %967, <2 x float> %968, <16 x i32> , !dbg !81 + %1175 = shufflevector <2 x float> %969, <2 x float> poison, <16 x i32> , !dbg !81 + %1176 = shufflevector <16 x float> %1174, <16 x float> %1175, <16 x i32> , !dbg !81 + %1177 = shufflevector <2 x float> %970, <2 x float> poison, <16 x i32> , !dbg !81 + %1178 = shufflevector <16 x float> %1176, <16 x float> %1177, <16 x i32> , !dbg !81 + %1179 = shufflevector <2 x float> %971, <2 x float> poison, <16 x i32> , !dbg !81 + %1180 = shufflevector <16 x float> %1178, <16 x float> %1179, <16 x i32> , !dbg !81 + %1181 = shufflevector <2 x float> %972, <2 x float> poison, <16 x i32> , !dbg !81 + %1182 = shufflevector <16 x float> %1180, <16 x float> %1181, <16 x i32> , !dbg !81 + %1183 = shufflevector <2 x float> %973, <2 x float> poison, <16 x i32> , !dbg !81 + %1184 = shufflevector <16 x float> %1182, <16 x float> %1183, <16 x i32> , !dbg !81 + %1185 = shufflevector <2 x float> %974, <2 x float> poison, <16 x i32> , !dbg !81 + %1186 = shufflevector <16 x float> %1184, <16 x float> %1185, <16 x i32> , !dbg !81 + %1187 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1145, <8 x half> %1131, <16 x float> %1186, i32 0, i32 0, i32 0), !dbg !81 + %1188 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1146, <8 x half> %1134, <16 x float> %1187, i32 0, i32 0, i32 0), !dbg !81 + %1189 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1147, <8 x half> %1137, <16 x float> %1188, i32 0, i32 0, i32 0), !dbg !81 + %1190 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1148, <8 x half> %1140, <16 x float> %1189, i32 0, i32 0, i32 0), !dbg !81 + %1191 = shufflevector <2 x float> %975, <2 x float> %976, <16 x i32> , !dbg !81 + %1192 = shufflevector <2 x float> %977, <2 x float> poison, <16 x i32> , !dbg !81 + %1193 = shufflevector <16 x float> %1191, <16 x float> %1192, <16 x i32> , !dbg !81 + %1194 = shufflevector <2 x float> %978, <2 x float> poison, <16 x i32> , !dbg !81 + %1195 = shufflevector <16 x float> %1193, <16 x float> %1194, <16 x i32> , !dbg !81 + %1196 = shufflevector <2 x float> %979, <2 x float> poison, <16 x i32> , !dbg !81 + %1197 = shufflevector <16 x float> %1195, <16 x float> %1196, <16 x i32> , !dbg !81 + %1198 = shufflevector <2 x float> %980, <2 x float> poison, <16 x i32> , !dbg !81 + %1199 = shufflevector <16 x float> %1197, <16 x float> %1198, <16 x i32> , !dbg !81 + %1200 = shufflevector <2 x float> %981, <2 x float> poison, <16 x i32> , !dbg !81 + %1201 = shufflevector <16 x float> %1199, <16 x float> %1200, <16 x i32> , !dbg !81 + %1202 = shufflevector <2 x float> %982, <2 x float> poison, <16 x i32> , !dbg !81 + %1203 = shufflevector <16 x float> %1201, <16 x float> %1202, <16 x i32> , !dbg !81 + %1204 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1149, <8 x half> %1131, <16 x float> %1203, i32 0, i32 0, i32 0), !dbg !81 + %1205 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1150, <8 x half> %1134, <16 x float> %1204, i32 0, i32 0, i32 0), !dbg !81 + %1206 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1151, <8 x half> %1137, <16 x float> %1205, i32 0, i32 0, i32 0), !dbg !81 + %1207 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1152, <8 x half> %1140, <16 x float> %1206, i32 0, i32 0, i32 0), !dbg !81 + %1208 = shufflevector <2 x float> %983, <2 x float> %984, <16 x i32> , !dbg !81 + %1209 = shufflevector <2 x float> %985, <2 x float> poison, <16 x i32> , !dbg !81 + %1210 = shufflevector <16 x float> %1208, <16 x float> %1209, <16 x i32> , !dbg !81 + %1211 = shufflevector <2 x float> %986, <2 x float> poison, <16 x i32> , !dbg !81 + %1212 = shufflevector <16 x float> %1210, <16 x float> %1211, <16 x i32> , !dbg !81 + %1213 = shufflevector <2 x float> %987, <2 x float> poison, <16 x i32> , !dbg !81 + %1214 = shufflevector <16 x float> %1212, <16 x float> %1213, <16 x i32> , !dbg !81 + %1215 = shufflevector <2 x float> %988, <2 x float> poison, <16 x i32> , !dbg !81 + %1216 = shufflevector <16 x float> %1214, <16 x float> %1215, <16 x i32> , !dbg !81 + %1217 = shufflevector <2 x float> %989, <2 x float> poison, <16 x i32> , !dbg !81 + %1218 = shufflevector <16 x float> %1216, <16 x float> %1217, <16 x i32> , !dbg !81 + %1219 = shufflevector <2 x float> %990, <2 x float> poison, <16 x i32> , !dbg !81 + %1220 = shufflevector <16 x float> %1218, <16 x float> %1219, <16 x i32> , !dbg !81 + %1221 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1153, <8 x half> %1131, <16 x float> %1220, i32 0, i32 0, i32 0), !dbg !81 + %1222 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1154, <8 x half> %1134, <16 x float> %1221, i32 0, i32 0, i32 0), !dbg !81 + %1223 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1155, <8 x half> %1137, <16 x float> %1222, i32 0, i32 0, i32 0), !dbg !81 + %1224 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1156, <8 x half> %1140, <16 x float> %1223, i32 0, i32 0, i32 0), !dbg !81 + %1225 = tail call float @llvm.maxnum.f32(float %879, float %880), !dbg !56 + %1226 = tail call float @llvm.maxnum.f32(float %1225, float %881), !dbg !56 + %1227 = tail call float @llvm.maxnum.f32(float %1226, float %882), !dbg !56 + %1228 = tail call float @llvm.maxnum.f32(float %1227, float %883), !dbg !56 + %1229 = tail call float @llvm.maxnum.f32(float %1228, float %884), !dbg !56 + %1230 = tail call float @llvm.maxnum.f32(float %1229, float %885), !dbg !56 + %1231 = tail call float @llvm.maxnum.f32(float %1230, float %886), !dbg !56 + %1232 = tail call float @llvm.maxnum.f32(float %1231, float %887), !dbg !56 + %1233 = tail call float @llvm.maxnum.f32(float %1232, float %888), !dbg !56 + %1234 = tail call float @llvm.maxnum.f32(float %1233, float %889), !dbg !56 + %1235 = tail call float @llvm.maxnum.f32(float %1234, float %890), !dbg !56 + %1236 = tail call float @llvm.maxnum.f32(float %1235, float %891), !dbg !56 + %1237 = tail call float @llvm.maxnum.f32(float %1236, float %892), !dbg !56 + %1238 = tail call float @llvm.maxnum.f32(float %1237, float %893), !dbg !56 + %1239 = tail call float @llvm.maxnum.f32(float %1238, float %894), !dbg !56 + %1240 = tail call float @llvm.maxnum.f32(float %1239, float %903), !dbg !56 + %1241 = tail call float @llvm.maxnum.f32(float %1240, float %904), !dbg !56 + %1242 = tail call float @llvm.maxnum.f32(float %1241, float %905), !dbg !56 + %1243 = tail call float @llvm.maxnum.f32(float %1242, float %906), !dbg !56 + %1244 = tail call float @llvm.maxnum.f32(float %1243, float %907), !dbg !56 + %1245 = tail call float @llvm.maxnum.f32(float %1244, float %908), !dbg !56 + %1246 = tail call float @llvm.maxnum.f32(float %1245, float %909), !dbg !56 + %1247 = tail call float @llvm.maxnum.f32(float %1246, float %910), !dbg !56 + %1248 = tail call float @llvm.maxnum.f32(float %1247, float %911), !dbg !56 + %1249 = tail call float @llvm.maxnum.f32(float %1248, float %912), !dbg !56 + %1250 = tail call float @llvm.maxnum.f32(float %1249, float %913), !dbg !56 + %1251 = tail call float @llvm.maxnum.f32(float %1250, float %914), !dbg !56 + %1252 = tail call float @llvm.maxnum.f32(float %1251, float %915), !dbg !56 + %1253 = tail call float @llvm.maxnum.f32(float %1252, float %916), !dbg !56 + %1254 = tail call float @llvm.maxnum.f32(float %1253, float %917), !dbg !56 + %1255 = tail call float @llvm.maxnum.f32(float %1254, float %918), !dbg !56 + %1256 = bitcast float %1255 to i32, !dbg !59 + %1257 = tail call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %1256, i32 %1256, i1 false, i1 false), !dbg !59 + %1258 = extractvalue { i32, i32 } %1257, 0, !dbg !59 + %1259 = extractvalue { i32, i32 } %1257, 1, !dbg !59 + %1260 = bitcast i32 %1258 to float, !dbg !59 + %1261 = bitcast i32 %1259 to float, !dbg !59 + %1262 = tail call float @llvm.maxnum.f32(float %1260, float %1261), !dbg !56 + %1263 = tail call float @llvm.maxnum.f32(float %.pn26400, float %1262), !dbg !60 + %1264 = fmul float %1263, 0x3FC0527DC0000000, !dbg !61 + %1265 = fmul float %879, 0x3FC0527DC0000000, !dbg !62 + %1266 = fmul float %880, 0x3FC0527DC0000000, !dbg !62 + %1267 = fmul float %881, 0x3FC0527DC0000000, !dbg !62 + %1268 = fmul float %882, 0x3FC0527DC0000000, !dbg !62 + %1269 = fmul float %883, 0x3FC0527DC0000000, !dbg !62 + %1270 = fmul float %884, 0x3FC0527DC0000000, !dbg !62 + %1271 = fmul float %885, 0x3FC0527DC0000000, !dbg !62 + %1272 = fmul float %886, 0x3FC0527DC0000000, !dbg !62 + %1273 = fmul float %887, 0x3FC0527DC0000000, !dbg !62 + %1274 = fmul float %888, 0x3FC0527DC0000000, !dbg !62 + %1275 = fmul float %889, 0x3FC0527DC0000000, !dbg !62 + %1276 = fmul float %890, 0x3FC0527DC0000000, !dbg !62 + %1277 = fmul float %891, 0x3FC0527DC0000000, !dbg !62 + %1278 = fmul float %892, 0x3FC0527DC0000000, !dbg !62 + %1279 = fmul float %893, 0x3FC0527DC0000000, !dbg !62 + %1280 = fmul float %894, 0x3FC0527DC0000000, !dbg !62 + %1281 = fmul float %903, 0x3FC0527DC0000000, !dbg !62 + %1282 = fmul float %904, 0x3FC0527DC0000000, !dbg !62 + %1283 = fmul float %905, 0x3FC0527DC0000000, !dbg !62 + %1284 = fmul float %906, 0x3FC0527DC0000000, !dbg !62 + %1285 = fmul float %907, 0x3FC0527DC0000000, !dbg !62 + %1286 = fmul float %908, 0x3FC0527DC0000000, !dbg !62 + %1287 = fmul float %909, 0x3FC0527DC0000000, !dbg !62 + %1288 = fmul float %910, 0x3FC0527DC0000000, !dbg !62 + %1289 = fmul float %911, 0x3FC0527DC0000000, !dbg !62 + %1290 = fmul float %912, 0x3FC0527DC0000000, !dbg !62 + %1291 = fmul float %913, 0x3FC0527DC0000000, !dbg !62 + %1292 = fmul float %914, 0x3FC0527DC0000000, !dbg !62 + %1293 = fmul float %915, 0x3FC0527DC0000000, !dbg !62 + %1294 = fmul float %916, 0x3FC0527DC0000000, !dbg !62 + %1295 = fmul float %917, 0x3FC0527DC0000000, !dbg !62 + %1296 = fmul float %918, 0x3FC0527DC0000000, !dbg !62 + %1297 = fsub float %1265, %1264, !dbg !63 + %1298 = fsub float %1266, %1264, !dbg !63 + %1299 = fsub float %1267, %1264, !dbg !63 + %1300 = fsub float %1268, %1264, !dbg !63 + %1301 = fsub float %1269, %1264, !dbg !63 + %1302 = fsub float %1270, %1264, !dbg !63 + %1303 = fsub float %1271, %1264, !dbg !63 + %1304 = fsub float %1272, %1264, !dbg !63 + %1305 = fsub float %1273, %1264, !dbg !63 + %1306 = fsub float %1274, %1264, !dbg !63 + %1307 = fsub float %1275, %1264, !dbg !63 + %1308 = fsub float %1276, %1264, !dbg !63 + %1309 = fsub float %1277, %1264, !dbg !63 + %1310 = fsub float %1278, %1264, !dbg !63 + %1311 = fsub float %1279, %1264, !dbg !63 + %1312 = fsub float %1280, %1264, !dbg !63 + %1313 = fsub float %1281, %1264, !dbg !63 + %1314 = fsub float %1282, %1264, !dbg !63 + %1315 = fsub float %1283, %1264, !dbg !63 + %1316 = fsub float %1284, %1264, !dbg !63 + %1317 = fsub float %1285, %1264, !dbg !63 + %1318 = fsub float %1286, %1264, !dbg !63 + %1319 = fsub float %1287, %1264, !dbg !63 + %1320 = fsub float %1288, %1264, !dbg !63 + %1321 = fsub float %1289, %1264, !dbg !63 + %1322 = fsub float %1290, %1264, !dbg !63 + %1323 = fsub float %1291, %1264, !dbg !63 + %1324 = fsub float %1292, %1264, !dbg !63 + %1325 = fsub float %1293, %1264, !dbg !63 + %1326 = fsub float %1294, %1264, !dbg !63 + %1327 = fsub float %1295, %1264, !dbg !63 + %1328 = fsub float %1296, %1264, !dbg !63 + %1329 = tail call float @llvm.amdgcn.exp2.f32(float %1297), !dbg !64 + %1330 = tail call float @llvm.amdgcn.exp2.f32(float %1298), !dbg !64 + %1331 = tail call float @llvm.amdgcn.exp2.f32(float %1299), !dbg !64 + %1332 = tail call float @llvm.amdgcn.exp2.f32(float %1300), !dbg !64 + %1333 = tail call float @llvm.amdgcn.exp2.f32(float %1301), !dbg !64 + %1334 = tail call float @llvm.amdgcn.exp2.f32(float %1302), !dbg !64 + %1335 = tail call float @llvm.amdgcn.exp2.f32(float %1303), !dbg !64 + %1336 = tail call float @llvm.amdgcn.exp2.f32(float %1304), !dbg !64 + %1337 = tail call float @llvm.amdgcn.exp2.f32(float %1305), !dbg !64 + %1338 = tail call float @llvm.amdgcn.exp2.f32(float %1306), !dbg !64 + %1339 = tail call float @llvm.amdgcn.exp2.f32(float %1307), !dbg !64 + %1340 = tail call float @llvm.amdgcn.exp2.f32(float %1308), !dbg !64 + %1341 = tail call float @llvm.amdgcn.exp2.f32(float %1309), !dbg !64 + %1342 = tail call float @llvm.amdgcn.exp2.f32(float %1310), !dbg !64 + %1343 = tail call float @llvm.amdgcn.exp2.f32(float %1311), !dbg !64 + %1344 = tail call float @llvm.amdgcn.exp2.f32(float %1312), !dbg !64 + %1345 = tail call float @llvm.amdgcn.exp2.f32(float %1313), !dbg !64 + %1346 = tail call float @llvm.amdgcn.exp2.f32(float %1314), !dbg !64 + %1347 = tail call float @llvm.amdgcn.exp2.f32(float %1315), !dbg !64 + %1348 = tail call float @llvm.amdgcn.exp2.f32(float %1316), !dbg !64 + %1349 = tail call float @llvm.amdgcn.exp2.f32(float %1317), !dbg !64 + %1350 = tail call float @llvm.amdgcn.exp2.f32(float %1318), !dbg !64 + %1351 = tail call float @llvm.amdgcn.exp2.f32(float %1319), !dbg !64 + %1352 = tail call float @llvm.amdgcn.exp2.f32(float %1320), !dbg !64 + %1353 = tail call float @llvm.amdgcn.exp2.f32(float %1321), !dbg !64 + %1354 = tail call float @llvm.amdgcn.exp2.f32(float %1322), !dbg !64 + %1355 = tail call float @llvm.amdgcn.exp2.f32(float %1323), !dbg !64 + %1356 = tail call float @llvm.amdgcn.exp2.f32(float %1324), !dbg !64 + %1357 = tail call float @llvm.amdgcn.exp2.f32(float %1325), !dbg !64 + %1358 = tail call float @llvm.amdgcn.exp2.f32(float %1326), !dbg !64 + %1359 = tail call float @llvm.amdgcn.exp2.f32(float %1327), !dbg !64 + %1360 = tail call float @llvm.amdgcn.exp2.f32(float %1328), !dbg !64 + %1361 = fmul float %.pn26400, 0x3FC0527DC0000000, !dbg !83 + %1362 = fsub float %1361, %1264, !dbg !65 + %1363 = tail call float @llvm.amdgcn.exp2.f32(float %1362), !dbg !66 + tail call void @llvm.amdgcn.s.setprio(i16 1), !dbg !84 + tail call void @llvm.amdgcn.s.waitcnt(i32 -49164), !dbg !47 + fence syncscope("workgroup") release, !dbg !47 + tail call void @llvm.amdgcn.s.barrier(), !dbg !47 + fence syncscope("workgroup") acquire, !dbg !47 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !85 + %1364 = getelementptr half, ptr addrspace(3) %687, i32 %676, !dbg !47 + %1365 = getelementptr half, ptr addrspace(3) %1364, i32 %677, !dbg !47 + %1366 = load <8 x half>, ptr addrspace(3) %1365, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1367 = getelementptr half, ptr addrspace(3) %687, i32 %678, !dbg !47 + %1368 = getelementptr half, ptr addrspace(3) %1367, i32 %677, !dbg !47 + %1369 = load <8 x half>, ptr addrspace(3) %1368, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1370 = getelementptr half, ptr addrspace(3) %687, i32 %679, !dbg !47 + %1371 = getelementptr half, ptr addrspace(3) %1370, i32 %677, !dbg !47 + %1372 = load <8 x half>, ptr addrspace(3) %1371, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1373 = getelementptr half, ptr addrspace(3) %687, i32 %680, !dbg !47 + %1374 = getelementptr half, ptr addrspace(3) %1373, i32 %677, !dbg !47 + %1375 = load <8 x half>, ptr addrspace(3) %1374, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1376 = getelementptr half, ptr addrspace(3) %687, i32 %681, !dbg !47 + %1377 = getelementptr half, ptr addrspace(3) %1376, i32 %677, !dbg !47 + %1378 = load <8 x half>, ptr addrspace(3) %1377, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1379 = getelementptr half, ptr addrspace(3) %687, i32 %682, !dbg !47 + %1380 = getelementptr half, ptr addrspace(3) %1379, i32 %677, !dbg !47 + %1381 = load <8 x half>, ptr addrspace(3) %1380, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1382 = getelementptr half, ptr addrspace(3) %687, i32 %683, !dbg !47 + %1383 = getelementptr half, ptr addrspace(3) %1382, i32 %677, !dbg !47 + %1384 = load <8 x half>, ptr addrspace(3) %1383, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1385 = getelementptr half, ptr addrspace(3) %687, i32 %684, !dbg !47 + %1386 = getelementptr half, ptr addrspace(3) %1385, i32 %677, !dbg !47 + %1387 = load <8 x half>, ptr addrspace(3) %1386, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1388 = getelementptr half, ptr addrspace(3) %1364, i32 %685, !dbg !47 + %1389 = load <8 x half>, ptr addrspace(3) %1388, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1390 = getelementptr half, ptr addrspace(3) %1367, i32 %685, !dbg !47 + %1391 = load <8 x half>, ptr addrspace(3) %1390, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1392 = getelementptr half, ptr addrspace(3) %1370, i32 %685, !dbg !47 + %1393 = load <8 x half>, ptr addrspace(3) %1392, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1394 = getelementptr half, ptr addrspace(3) %1373, i32 %685, !dbg !47 + %1395 = load <8 x half>, ptr addrspace(3) %1394, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1396 = getelementptr half, ptr addrspace(3) %1376, i32 %685, !dbg !47 + %1397 = load <8 x half>, ptr addrspace(3) %1396, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1398 = getelementptr half, ptr addrspace(3) %1379, i32 %685, !dbg !47 + %1399 = load <8 x half>, ptr addrspace(3) %1398, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1400 = getelementptr half, ptr addrspace(3) %1382, i32 %685, !dbg !47 + %1401 = load <8 x half>, ptr addrspace(3) %1400, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1402 = getelementptr half, ptr addrspace(3) %1385, i32 %685, !dbg !47 + %1403 = load <8 x half>, ptr addrspace(3) %1402, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %.idx367 = shl i32 %690, 14, !dbg !47 + %1404 = getelementptr i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.idx367, !dbg !47 + %1405 = getelementptr inbounds nuw i8, ptr addrspace(3) %1404, i32 %.idx2, !dbg !47 + %1406 = getelementptr inbounds nuw i8, ptr addrspace(3) %1404, i32 %.idx4, !dbg !47 + %1407 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %1117, i16 %333, i32 2147483646, i32 159744), !dbg !47 + %1408 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %338, i32 %113), !dbg !47 + %1409 = shl i32 %1408, 1, !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %1407, ptr addrspace(3) %1405, i32 16, i32 %1409, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + %1410 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %338, i32 %114), !dbg !47 + %1411 = shl i32 %1410, 1, !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %1407, ptr addrspace(3) nonnull %1406, i32 16, i32 %1411, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !86 + %1412 = add nuw nsw i32 %694, 64, !dbg !70 + %1413 = icmp samesign ult i32 %694, 7936, !dbg !70 + %1414 = shufflevector <8 x half> %1366, <8 x half> poison, <2 x i32> + %1415 = shufflevector <8 x half> %1366, <8 x half> poison, <2 x i32> + %1416 = shufflevector <8 x half> %1366, <8 x half> poison, <2 x i32> + %1417 = shufflevector <8 x half> %1366, <8 x half> poison, <2 x i32> + %1418 = shufflevector <8 x half> %1369, <8 x half> poison, <2 x i32> + %1419 = shufflevector <8 x half> %1369, <8 x half> poison, <2 x i32> + %1420 = shufflevector <8 x half> %1369, <8 x half> poison, <2 x i32> + %1421 = shufflevector <8 x half> %1369, <8 x half> poison, <2 x i32> + %1422 = shufflevector <8 x half> %1372, <8 x half> poison, <2 x i32> + %1423 = shufflevector <8 x half> %1372, <8 x half> poison, <2 x i32> + %1424 = shufflevector <8 x half> %1372, <8 x half> poison, <2 x i32> + %1425 = shufflevector <8 x half> %1372, <8 x half> poison, <2 x i32> + %1426 = shufflevector <8 x half> %1375, <8 x half> poison, <2 x i32> + %1427 = shufflevector <8 x half> %1375, <8 x half> poison, <2 x i32> + %1428 = shufflevector <8 x half> %1375, <8 x half> poison, <2 x i32> + %1429 = shufflevector <8 x half> %1375, <8 x half> poison, <2 x i32> + %1430 = shufflevector <8 x half> %1378, <8 x half> poison, <2 x i32> + %1431 = shufflevector <8 x half> %1378, <8 x half> poison, <2 x i32> + %1432 = shufflevector <8 x half> %1378, <8 x half> poison, <2 x i32> + %1433 = shufflevector <8 x half> %1378, <8 x half> poison, <2 x i32> + %1434 = shufflevector <8 x half> %1381, <8 x half> poison, <2 x i32> + %1435 = shufflevector <8 x half> %1381, <8 x half> poison, <2 x i32> + %1436 = shufflevector <8 x half> %1381, <8 x half> poison, <2 x i32> + %1437 = shufflevector <8 x half> %1381, <8 x half> poison, <2 x i32> + %1438 = shufflevector <8 x half> %1384, <8 x half> poison, <2 x i32> + %1439 = shufflevector <8 x half> %1384, <8 x half> poison, <2 x i32> + %1440 = shufflevector <8 x half> %1384, <8 x half> poison, <2 x i32> + %1441 = shufflevector <8 x half> %1384, <8 x half> poison, <2 x i32> + %1442 = shufflevector <8 x half> %1387, <8 x half> poison, <2 x i32> + %1443 = shufflevector <8 x half> %1387, <8 x half> poison, <2 x i32> + %1444 = shufflevector <8 x half> %1387, <8 x half> poison, <2 x i32> + %1445 = shufflevector <8 x half> %1387, <8 x half> poison, <2 x i32> + %1446 = shufflevector <8 x half> %1389, <8 x half> poison, <2 x i32> + %1447 = shufflevector <8 x half> %1389, <8 x half> poison, <2 x i32> + %1448 = shufflevector <8 x half> %1389, <8 x half> poison, <2 x i32> + %1449 = shufflevector <8 x half> %1389, <8 x half> poison, <2 x i32> + %1450 = shufflevector <8 x half> %1391, <8 x half> poison, <2 x i32> + %1451 = shufflevector <8 x half> %1391, <8 x half> poison, <2 x i32> + %1452 = shufflevector <8 x half> %1391, <8 x half> poison, <2 x i32> + %1453 = shufflevector <8 x half> %1391, <8 x half> poison, <2 x i32> + %1454 = shufflevector <8 x half> %1393, <8 x half> poison, <2 x i32> + %1455 = shufflevector <8 x half> %1393, <8 x half> poison, <2 x i32> + %1456 = shufflevector <8 x half> %1393, <8 x half> poison, <2 x i32> + %1457 = shufflevector <8 x half> %1393, <8 x half> poison, <2 x i32> + %1458 = shufflevector <8 x half> %1395, <8 x half> poison, <2 x i32> + %1459 = shufflevector <8 x half> %1395, <8 x half> poison, <2 x i32> + %1460 = shufflevector <8 x half> %1395, <8 x half> poison, <2 x i32> + %1461 = shufflevector <8 x half> %1395, <8 x half> poison, <2 x i32> + %1462 = shufflevector <8 x half> %1397, <8 x half> poison, <2 x i32> + %1463 = shufflevector <8 x half> %1397, <8 x half> poison, <2 x i32> + %1464 = shufflevector <8 x half> %1397, <8 x half> poison, <2 x i32> + %1465 = shufflevector <8 x half> %1397, <8 x half> poison, <2 x i32> + %1466 = shufflevector <8 x half> %1399, <8 x half> poison, <2 x i32> + %1467 = shufflevector <8 x half> %1399, <8 x half> poison, <2 x i32> + %1468 = shufflevector <8 x half> %1399, <8 x half> poison, <2 x i32> + %1469 = shufflevector <8 x half> %1399, <8 x half> poison, <2 x i32> + %1470 = shufflevector <8 x half> %1401, <8 x half> poison, <2 x i32> + %1471 = shufflevector <8 x half> %1401, <8 x half> poison, <2 x i32> + %1472 = shufflevector <8 x half> %1401, <8 x half> poison, <2 x i32> + %1473 = shufflevector <8 x half> %1401, <8 x half> poison, <2 x i32> + %1474 = shufflevector <8 x half> %1403, <8 x half> poison, <2 x i32> + %1475 = shufflevector <8 x half> %1403, <8 x half> poison, <2 x i32> + %1476 = shufflevector <8 x half> %1403, <8 x half> poison, <2 x i32> + %1477 = shufflevector <8 x half> %1403, <8 x half> poison, <2 x i32> + %1478 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> + %1479 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> + %1480 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> + %1481 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> + %1482 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> + %1483 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> + %1484 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> + %1485 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> + %1486 = shufflevector <16 x float> %1224, <16 x float> poison, <2 x i32> + %1487 = shufflevector <16 x float> %1224, <16 x float> poison, <2 x i32> + %1488 = shufflevector <16 x float> %1224, <16 x float> poison, <2 x i32> + %1489 = shufflevector <16 x float> %1224, <16 x float> poison, <2 x i32> + %1490 = shufflevector <16 x float> %1224, <16 x float> poison, <2 x i32> + %1491 = shufflevector <16 x float> %1224, <16 x float> poison, <2 x i32> + %1492 = shufflevector <16 x float> %1224, <16 x float> poison, <2 x i32> + %1493 = shufflevector <16 x float> %1224, <16 x float> poison, <2 x i32> + %1494 = shufflevector <16 x float> %1207, <16 x float> poison, <2 x i32> + %1495 = shufflevector <16 x float> %1207, <16 x float> poison, <2 x i32> + %1496 = shufflevector <16 x float> %1207, <16 x float> poison, <2 x i32> + %1497 = shufflevector <16 x float> %1207, <16 x float> poison, <2 x i32> + %1498 = shufflevector <16 x float> %1207, <16 x float> poison, <2 x i32> + %1499 = shufflevector <16 x float> %1207, <16 x float> poison, <2 x i32> + %1500 = shufflevector <16 x float> %1207, <16 x float> poison, <2 x i32> + %1501 = shufflevector <16 x float> %1207, <16 x float> poison, <2 x i32> + %1502 = shufflevector <16 x float> %1190, <16 x float> poison, <2 x i32> + %1503 = shufflevector <16 x float> %1190, <16 x float> poison, <2 x i32> + %1504 = shufflevector <16 x float> %1190, <16 x float> poison, <2 x i32> + %1505 = shufflevector <16 x float> %1190, <16 x float> poison, <2 x i32> + %1506 = shufflevector <16 x float> %1190, <16 x float> poison, <2 x i32> + %1507 = shufflevector <16 x float> %1190, <16 x float> poison, <2 x i32> + %1508 = shufflevector <16 x float> %1190, <16 x float> poison, <2 x i32> + %1509 = shufflevector <16 x float> %1190, <16 x float> poison, <2 x i32> + br i1 %1413, label %686, label %1510, !dbg !70 + +1510: ; preds = %686 + br i1 %573, label %1511, label %1512, !dbg !87 + +1511: ; preds = %1510 + tail call void @llvm.amdgcn.s.barrier(), !dbg !87 + br label %1512, !dbg !87 + +1512: ; preds = %1511, %1510 + %1513 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1366, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0), !dbg !54 + %1514 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1369, <8 x half> %218, <16 x float> %1513, i32 0, i32 0, i32 0), !dbg !54 + %1515 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1372, <8 x half> %220, <16 x float> %1514, i32 0, i32 0, i32 0), !dbg !54 + %1516 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1375, <8 x half> %222, <16 x float> %1515, i32 0, i32 0, i32 0), !dbg !54 + %1517 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1378, <8 x half> %224, <16 x float> %1516, i32 0, i32 0, i32 0), !dbg !54 + %1518 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1381, <8 x half> %226, <16 x float> %1517, i32 0, i32 0, i32 0), !dbg !54 + %1519 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1384, <8 x half> %228, <16 x float> %1518, i32 0, i32 0, i32 0), !dbg !54 + %1520 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1387, <8 x half> %230, <16 x float> %1519, i32 0, i32 0, i32 0), !dbg !54 + %1521 = extractelement <16 x float> %1520, i64 0, !dbg !54 + %1522 = extractelement <16 x float> %1520, i64 1, !dbg !54 + %1523 = extractelement <16 x float> %1520, i64 2, !dbg !54 + %1524 = extractelement <16 x float> %1520, i64 3, !dbg !54 + %1525 = extractelement <16 x float> %1520, i64 4, !dbg !54 + %1526 = extractelement <16 x float> %1520, i64 5, !dbg !54 + %1527 = extractelement <16 x float> %1520, i64 6, !dbg !54 + %1528 = extractelement <16 x float> %1520, i64 7, !dbg !54 + %1529 = extractelement <16 x float> %1520, i64 8, !dbg !54 + %1530 = extractelement <16 x float> %1520, i64 9, !dbg !54 + %1531 = extractelement <16 x float> %1520, i64 10, !dbg !54 + %1532 = extractelement <16 x float> %1520, i64 11, !dbg !54 + %1533 = extractelement <16 x float> %1520, i64 12, !dbg !54 + %1534 = extractelement <16 x float> %1520, i64 13, !dbg !54 + %1535 = extractelement <16 x float> %1520, i64 14, !dbg !54 + %1536 = extractelement <16 x float> %1520, i64 15, !dbg !54 + %1537 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1389, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0), !dbg !54 + %1538 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1391, <8 x half> %218, <16 x float> %1537, i32 0, i32 0, i32 0), !dbg !54 + %1539 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1393, <8 x half> %220, <16 x float> %1538, i32 0, i32 0, i32 0), !dbg !54 + %1540 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1395, <8 x half> %222, <16 x float> %1539, i32 0, i32 0, i32 0), !dbg !54 + %1541 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1397, <8 x half> %224, <16 x float> %1540, i32 0, i32 0, i32 0), !dbg !54 + %1542 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1399, <8 x half> %226, <16 x float> %1541, i32 0, i32 0, i32 0), !dbg !54 + %1543 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1401, <8 x half> %228, <16 x float> %1542, i32 0, i32 0, i32 0), !dbg !54 + %1544 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1403, <8 x half> %230, <16 x float> %1543, i32 0, i32 0, i32 0), !dbg !54 + %1545 = extractelement <16 x float> %1544, i64 0, !dbg !54 + %1546 = extractelement <16 x float> %1544, i64 1, !dbg !54 + %1547 = extractelement <16 x float> %1544, i64 2, !dbg !54 + %1548 = extractelement <16 x float> %1544, i64 3, !dbg !54 + %1549 = extractelement <16 x float> %1544, i64 4, !dbg !54 + %1550 = extractelement <16 x float> %1544, i64 5, !dbg !54 + %1551 = extractelement <16 x float> %1544, i64 6, !dbg !54 + %1552 = extractelement <16 x float> %1544, i64 7, !dbg !54 + %1553 = extractelement <16 x float> %1544, i64 8, !dbg !54 + %1554 = extractelement <16 x float> %1544, i64 9, !dbg !54 + %1555 = extractelement <16 x float> %1544, i64 10, !dbg !54 + %1556 = extractelement <16 x float> %1544, i64 11, !dbg !54 + %1557 = extractelement <16 x float> %1544, i64 12, !dbg !54 + %1558 = extractelement <16 x float> %1544, i64 13, !dbg !54 + %1559 = extractelement <16 x float> %1544, i64 14, !dbg !54 + %1560 = extractelement <16 x float> %1544, i64 15, !dbg !54 + %1561 = fadd float %1329, %1330, !dbg !72 + %1562 = fadd float %1561, %1331, !dbg !72 + %1563 = fadd float %1562, %1332, !dbg !72 + %1564 = fadd float %1563, %1333, !dbg !72 + %1565 = fadd float %1564, %1334, !dbg !72 + %1566 = fadd float %1565, %1335, !dbg !72 + %1567 = fadd float %1566, %1336, !dbg !72 + %1568 = fadd float %1567, %1337, !dbg !72 + %1569 = fadd float %1568, %1338, !dbg !72 + %1570 = fadd float %1569, %1339, !dbg !72 + %1571 = fadd float %1570, %1340, !dbg !72 + %1572 = fadd float %1571, %1341, !dbg !72 + %1573 = fadd float %1572, %1342, !dbg !72 + %1574 = fadd float %1573, %1343, !dbg !72 + %1575 = fadd float %1574, %1344, !dbg !72 + %1576 = fadd float %1575, %1345, !dbg !72 + %1577 = fadd float %1576, %1346, !dbg !72 + %1578 = fadd float %1577, %1347, !dbg !72 + %1579 = fadd float %1578, %1348, !dbg !72 + %1580 = fadd float %1579, %1349, !dbg !72 + %1581 = fadd float %1580, %1350, !dbg !72 + %1582 = fadd float %1581, %1351, !dbg !72 + %1583 = fadd float %1582, %1352, !dbg !72 + %1584 = fadd float %1583, %1353, !dbg !72 + %1585 = fadd float %1584, %1354, !dbg !72 + %1586 = fadd float %1585, %1355, !dbg !72 + %1587 = fadd float %1586, %1356, !dbg !72 + %1588 = fadd float %1587, %1357, !dbg !72 + %1589 = fadd float %1588, %1358, !dbg !72 + %1590 = fadd float %1589, %1359, !dbg !72 + %1591 = fadd float %1590, %1360, !dbg !72 + %1592 = bitcast float %1591 to i32, !dbg !73 + %1593 = tail call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %1592, i32 %1592, i1 false, i1 false), !dbg !73 + %1594 = extractvalue { i32, i32 } %1593, 0, !dbg !73 + %1595 = extractvalue { i32, i32 } %1593, 1, !dbg !73 + %1596 = bitcast i32 %1594 to float, !dbg !73 + %1597 = bitcast i32 %1595 to float, !dbg !73 + %1598 = fadd float %1596, %1597, !dbg !72 + %1599 = insertelement <2 x float> poison, float %1363, i64 0, !dbg !74 + %1600 = fmul float %992, %1363, !dbg !75 + %1601 = fadd float %1600, %1598, !dbg !76 + %1602 = insertelement <2 x float> poison, float %1329, i64 0, !dbg !77 + %1603 = insertelement <2 x float> %1602, float %1330, i64 1, !dbg !77 + %1604 = fptrunc <2 x float> %1603 to <2 x half>, !dbg !77 + %1605 = insertelement <2 x float> poison, float %1331, i64 0, !dbg !77 + %1606 = insertelement <2 x float> %1605, float %1332, i64 1, !dbg !77 + %1607 = fptrunc <2 x float> %1606 to <2 x half>, !dbg !77 + %1608 = insertelement <2 x float> poison, float %1333, i64 0, !dbg !77 + %1609 = insertelement <2 x float> %1608, float %1334, i64 1, !dbg !77 + %1610 = fptrunc <2 x float> %1609 to <2 x half>, !dbg !77 + %1611 = shufflevector <2 x half> %1610, <2 x half> poison, <8 x i32> + %1612 = insertelement <2 x float> poison, float %1335, i64 0, !dbg !77 + %1613 = insertelement <2 x float> %1612, float %1336, i64 1, !dbg !77 + %1614 = fptrunc <2 x float> %1613 to <2 x half>, !dbg !77 + %1615 = shufflevector <2 x half> %1614, <2 x half> poison, <8 x i32> + %1616 = insertelement <2 x float> poison, float %1337, i64 0, !dbg !77 + %1617 = insertelement <2 x float> %1616, float %1338, i64 1, !dbg !77 + %1618 = fptrunc <2 x float> %1617 to <2 x half>, !dbg !77 + %1619 = insertelement <2 x float> poison, float %1339, i64 0, !dbg !77 + %1620 = insertelement <2 x float> %1619, float %1340, i64 1, !dbg !77 + %1621 = fptrunc <2 x float> %1620 to <2 x half>, !dbg !77 + %1622 = insertelement <2 x float> poison, float %1341, i64 0, !dbg !77 + %1623 = insertelement <2 x float> %1622, float %1342, i64 1, !dbg !77 + %1624 = fptrunc <2 x float> %1623 to <2 x half>, !dbg !77 + %1625 = shufflevector <2 x half> %1624, <2 x half> poison, <8 x i32> + %1626 = insertelement <2 x float> poison, float %1343, i64 0, !dbg !77 + %1627 = insertelement <2 x float> %1626, float %1344, i64 1, !dbg !77 + %1628 = fptrunc <2 x float> %1627 to <2 x half>, !dbg !77 + %1629 = shufflevector <2 x half> %1628, <2 x half> poison, <8 x i32> + %1630 = insertelement <2 x float> poison, float %1345, i64 0, !dbg !77 + %1631 = insertelement <2 x float> %1630, float %1346, i64 1, !dbg !77 + %1632 = fptrunc <2 x float> %1631 to <2 x half>, !dbg !77 + %1633 = insertelement <2 x float> poison, float %1347, i64 0, !dbg !77 + %1634 = insertelement <2 x float> %1633, float %1348, i64 1, !dbg !77 + %1635 = fptrunc <2 x float> %1634 to <2 x half>, !dbg !77 + %1636 = insertelement <2 x float> poison, float %1349, i64 0, !dbg !77 + %1637 = insertelement <2 x float> %1636, float %1350, i64 1, !dbg !77 + %1638 = fptrunc <2 x float> %1637 to <2 x half>, !dbg !77 + %1639 = shufflevector <2 x half> %1638, <2 x half> poison, <8 x i32> + %1640 = insertelement <2 x float> poison, float %1351, i64 0, !dbg !77 + %1641 = insertelement <2 x float> %1640, float %1352, i64 1, !dbg !77 + %1642 = fptrunc <2 x float> %1641 to <2 x half>, !dbg !77 + %1643 = shufflevector <2 x half> %1642, <2 x half> poison, <8 x i32> + %1644 = insertelement <2 x float> poison, float %1353, i64 0, !dbg !77 + %1645 = insertelement <2 x float> %1644, float %1354, i64 1, !dbg !77 + %1646 = fptrunc <2 x float> %1645 to <2 x half>, !dbg !77 + %1647 = insertelement <2 x float> poison, float %1355, i64 0, !dbg !77 + %1648 = insertelement <2 x float> %1647, float %1356, i64 1, !dbg !77 + %1649 = fptrunc <2 x float> %1648 to <2 x half>, !dbg !77 + %1650 = insertelement <2 x float> poison, float %1357, i64 0, !dbg !77 + %1651 = insertelement <2 x float> %1650, float %1358, i64 1, !dbg !77 + %1652 = fptrunc <2 x float> %1651 to <2 x half>, !dbg !77 + %1653 = shufflevector <2 x half> %1652, <2 x half> poison, <8 x i32> + %1654 = insertelement <2 x float> poison, float %1359, i64 0, !dbg !77 + %1655 = insertelement <2 x float> %1654, float %1360, i64 1, !dbg !77 + %1656 = fptrunc <2 x float> %1655 to <2 x half>, !dbg !77 + %1657 = shufflevector <2 x half> %1656, <2 x half> poison, <8 x i32> + tail call void @llvm.amdgcn.s.waitcnt(i32 -49168), !dbg !47 + fence syncscope("workgroup") release, !dbg !47 + tail call void @llvm.amdgcn.s.barrier(), !dbg !47 + fence syncscope("workgroup") acquire, !dbg !47 + %1658 = and i32 %576, 12, !dbg !47 + %1659 = or disjoint i32 %1658, %84, !dbg !47 + %1660 = or disjoint i32 %580, %1659, !dbg !47 + %1661 = or disjoint i32 %1660, %583, !dbg !47 + %1662 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1661, !dbg !47 + %1663 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %1662), !dbg !47, !alias.scope !52, !noalias !48 + %1664 = or disjoint i32 %1660, %585, !dbg !47 + %1665 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1664, !dbg !47 + %1666 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1665), !dbg !47, !alias.scope !52, !noalias !48 + %1667 = or disjoint i32 %1660, %587, !dbg !47 + %1668 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1667, !dbg !47 + %1669 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1668), !dbg !47, !alias.scope !52, !noalias !48 + %1670 = or disjoint i32 %1660, %589, !dbg !47 + %1671 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1670, !dbg !47 + %1672 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1671), !dbg !47, !alias.scope !52, !noalias !48 + %1673 = or disjoint i32 %1660, %591, !dbg !47 + %1674 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1673, !dbg !47 + %1675 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1674), !dbg !47, !alias.scope !52, !noalias !48 + %1676 = or disjoint i32 %1660, %593, !dbg !47 + %1677 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1676, !dbg !47 + %1678 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1677), !dbg !47, !alias.scope !52, !noalias !48 + %1679 = or disjoint i32 %1660, %595, !dbg !47 + %1680 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1679, !dbg !47 + %1681 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1680), !dbg !47, !alias.scope !52, !noalias !48 + %1682 = or disjoint i32 %1660, %597, !dbg !47 + %1683 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1682, !dbg !47 + %1684 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1683), !dbg !47, !alias.scope !52, !noalias !48 + %1685 = or disjoint i32 %1658, 32, !dbg !47 + %1686 = xor i32 %1685, %84, !dbg !47 + %1687 = or disjoint i32 %35, %1686, !dbg !47 + %1688 = or disjoint i32 %1687, %85, !dbg !47 + %1689 = or disjoint i32 %1688, %583, !dbg !47 + %1690 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1689, !dbg !47 + %1691 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %1690), !dbg !47, !alias.scope !52, !noalias !48 + %1692 = or disjoint i32 %1688, %585, !dbg !47 + %1693 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1692, !dbg !47 + %1694 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1693), !dbg !47, !alias.scope !52, !noalias !48 + %1695 = or disjoint i32 %1688, %587, !dbg !47 + %1696 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1695, !dbg !47 + %1697 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1696), !dbg !47, !alias.scope !52, !noalias !48 + %1698 = or disjoint i32 %1688, %589, !dbg !47 + %1699 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1698, !dbg !47 + %1700 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1699), !dbg !47, !alias.scope !52, !noalias !48 + %1701 = or disjoint i32 %1688, %591, !dbg !47 + %1702 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1701, !dbg !47 + %1703 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1702), !dbg !47, !alias.scope !52, !noalias !48 + %1704 = or disjoint i32 %1688, %593, !dbg !47 + %1705 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1704, !dbg !47 + %1706 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1705), !dbg !47, !alias.scope !52, !noalias !48 + %1707 = or disjoint i32 %1688, %595, !dbg !47 + %1708 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1707, !dbg !47 + %1709 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1708), !dbg !47, !alias.scope !52, !noalias !48 + %1710 = or disjoint i32 %1688, %597, !dbg !47 + %1711 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1710, !dbg !47 + %1712 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1711), !dbg !47, !alias.scope !52, !noalias !48 + %1713 = or disjoint i32 %1659, 64, !dbg !47 + %1714 = xor i32 %1713, %85, !dbg !47 + %1715 = or disjoint i32 %1714, %35, !dbg !47 + %1716 = or disjoint i32 %1715, %583, !dbg !47 + %1717 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1716, !dbg !47 + %1718 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %1717), !dbg !47, !alias.scope !52, !noalias !48 + %1719 = or disjoint i32 %1715, %585, !dbg !47 + %1720 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1719, !dbg !47 + %1721 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1720), !dbg !47, !alias.scope !52, !noalias !48 + %1722 = or disjoint i32 %1715, %587, !dbg !47 + %1723 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1722, !dbg !47 + %1724 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1723), !dbg !47, !alias.scope !52, !noalias !48 + %1725 = or disjoint i32 %1715, %589, !dbg !47 + %1726 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1725, !dbg !47 + %1727 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1726), !dbg !47, !alias.scope !52, !noalias !48 + %1728 = or disjoint i32 %1715, %591, !dbg !47 + %1729 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1728, !dbg !47 + %1730 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1729), !dbg !47, !alias.scope !52, !noalias !48 + %1731 = or disjoint i32 %1715, %593, !dbg !47 + %1732 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1731, !dbg !47 + %1733 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1732), !dbg !47, !alias.scope !52, !noalias !48 + %1734 = or disjoint i32 %1715, %595, !dbg !47 + %1735 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1734, !dbg !47 + %1736 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1735), !dbg !47, !alias.scope !52, !noalias !48 + %1737 = or disjoint i32 %1715, %597, !dbg !47 + %1738 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1737, !dbg !47 + %1739 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1738), !dbg !47, !alias.scope !52, !noalias !48 + %1740 = or disjoint i32 %1658, 96, !dbg !47 + %1741 = xor i32 %607, %1740, !dbg !47 + %1742 = or disjoint i32 %1741, %35, !dbg !47 + %1743 = or disjoint i32 %1742, %583, !dbg !47 + %1744 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1743, !dbg !47 + %1745 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %1744), !dbg !47, !alias.scope !52, !noalias !48 + %1746 = or disjoint i32 %1742, %585, !dbg !47 + %1747 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1746, !dbg !47 + %1748 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1747), !dbg !47, !alias.scope !52, !noalias !48 + %1749 = or disjoint i32 %1742, %587, !dbg !47 + %1750 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1749, !dbg !47 + %1751 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1750), !dbg !47, !alias.scope !52, !noalias !48 + %1752 = or disjoint i32 %1742, %589, !dbg !47 + %1753 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1752, !dbg !47 + %1754 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1753), !dbg !47, !alias.scope !52, !noalias !48 + %1755 = or disjoint i32 %1742, %591, !dbg !47 + %1756 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1755, !dbg !47 + %1757 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1756), !dbg !47, !alias.scope !52, !noalias !48 + %1758 = or disjoint i32 %1742, %593, !dbg !47 + %1759 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1758, !dbg !47 + %1760 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1759), !dbg !47, !alias.scope !52, !noalias !48 + %1761 = or disjoint i32 %1742, %595, !dbg !47 + %1762 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1761, !dbg !47 + %1763 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1762), !dbg !47, !alias.scope !52, !noalias !48 + %1764 = or disjoint i32 %1742, %597, !dbg !47 + %1765 = getelementptr inbounds nuw half, ptr addrspace(3) %688, i32 %1764, !dbg !47 + %1766 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1765), !dbg !47, !alias.scope !52, !noalias !48 + %1767 = getelementptr half, ptr addrspace(1) %1117, i64 %397, !dbg !55 + %1768 = tail call float @llvm.maxnum.f32(float %1521, float %1522), !dbg !56 + %1769 = tail call float @llvm.maxnum.f32(float %1768, float %1523), !dbg !56 + %1770 = tail call float @llvm.maxnum.f32(float %1769, float %1524), !dbg !56 + %1771 = tail call float @llvm.maxnum.f32(float %1770, float %1525), !dbg !56 + %1772 = tail call float @llvm.maxnum.f32(float %1771, float %1526), !dbg !56 + %1773 = tail call float @llvm.maxnum.f32(float %1772, float %1527), !dbg !56 + %1774 = tail call float @llvm.maxnum.f32(float %1773, float %1528), !dbg !56 + %1775 = tail call float @llvm.maxnum.f32(float %1774, float %1529), !dbg !56 + %1776 = tail call float @llvm.maxnum.f32(float %1775, float %1530), !dbg !56 + %1777 = tail call float @llvm.maxnum.f32(float %1776, float %1531), !dbg !56 + %1778 = tail call float @llvm.maxnum.f32(float %1777, float %1532), !dbg !56 + %1779 = tail call float @llvm.maxnum.f32(float %1778, float %1533), !dbg !56 + %1780 = tail call float @llvm.maxnum.f32(float %1779, float %1534), !dbg !56 + %1781 = tail call float @llvm.maxnum.f32(float %1780, float %1535), !dbg !56 + %1782 = tail call float @llvm.maxnum.f32(float %1781, float %1536), !dbg !56 + %1783 = tail call float @llvm.maxnum.f32(float %1782, float %1545), !dbg !56 + %1784 = tail call float @llvm.maxnum.f32(float %1783, float %1546), !dbg !56 + %1785 = tail call float @llvm.maxnum.f32(float %1784, float %1547), !dbg !56 + %1786 = tail call float @llvm.maxnum.f32(float %1785, float %1548), !dbg !56 + %1787 = tail call float @llvm.maxnum.f32(float %1786, float %1549), !dbg !56 + %1788 = tail call float @llvm.maxnum.f32(float %1787, float %1550), !dbg !56 + %1789 = tail call float @llvm.maxnum.f32(float %1788, float %1551), !dbg !56 + %1790 = tail call float @llvm.maxnum.f32(float %1789, float %1552), !dbg !56 + %1791 = tail call float @llvm.maxnum.f32(float %1790, float %1553), !dbg !56 + %1792 = tail call float @llvm.maxnum.f32(float %1791, float %1554), !dbg !56 + %1793 = tail call float @llvm.maxnum.f32(float %1792, float %1555), !dbg !56 + %1794 = tail call float @llvm.maxnum.f32(float %1793, float %1556), !dbg !56 + %1795 = tail call float @llvm.maxnum.f32(float %1794, float %1557), !dbg !56 + %1796 = tail call float @llvm.maxnum.f32(float %1795, float %1558), !dbg !56 + %1797 = tail call float @llvm.maxnum.f32(float %1796, float %1559), !dbg !56 + %1798 = tail call float @llvm.maxnum.f32(float %1797, float %1560), !dbg !56 + %1799 = bitcast float %1798 to i32, !dbg !59 + %1800 = tail call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %1799, i32 %1799, i1 false, i1 false), !dbg !59 + %1801 = extractvalue { i32, i32 } %1800, 0, !dbg !59 + %1802 = extractvalue { i32, i32 } %1800, 1, !dbg !59 + %1803 = bitcast i32 %1801 to float, !dbg !59 + %1804 = bitcast i32 %1802 to float, !dbg !59 + %1805 = tail call float @llvm.maxnum.f32(float %1803, float %1804), !dbg !56 + %1806 = tail call float @llvm.maxnum.f32(float %1263, float %1805), !dbg !60 + %1807 = fmul float %1521, 0x3FC0527DC0000000, !dbg !62 + %1808 = fmul float %1522, 0x3FC0527DC0000000, !dbg !62 + %1809 = fmul float %1523, 0x3FC0527DC0000000, !dbg !62 + %1810 = fmul float %1524, 0x3FC0527DC0000000, !dbg !62 + %1811 = fmul float %1525, 0x3FC0527DC0000000, !dbg !62 + %1812 = fmul float %1526, 0x3FC0527DC0000000, !dbg !62 + %1813 = fmul float %1527, 0x3FC0527DC0000000, !dbg !62 + %1814 = fmul float %1528, 0x3FC0527DC0000000, !dbg !62 + %1815 = fmul float %1529, 0x3FC0527DC0000000, !dbg !62 + %1816 = fmul float %1530, 0x3FC0527DC0000000, !dbg !62 + %1817 = fmul float %1531, 0x3FC0527DC0000000, !dbg !62 + %1818 = fmul float %1532, 0x3FC0527DC0000000, !dbg !62 + %1819 = fmul float %1533, 0x3FC0527DC0000000, !dbg !62 + %1820 = fmul float %1534, 0x3FC0527DC0000000, !dbg !62 + %1821 = fmul float %1535, 0x3FC0527DC0000000, !dbg !62 + %1822 = fmul float %1536, 0x3FC0527DC0000000, !dbg !62 + %1823 = fmul float %1545, 0x3FC0527DC0000000, !dbg !62 + %1824 = fmul float %1546, 0x3FC0527DC0000000, !dbg !62 + %1825 = fmul float %1547, 0x3FC0527DC0000000, !dbg !62 + %1826 = fmul float %1548, 0x3FC0527DC0000000, !dbg !62 + %1827 = fmul float %1549, 0x3FC0527DC0000000, !dbg !62 + %1828 = fmul float %1550, 0x3FC0527DC0000000, !dbg !62 + %1829 = fmul float %1551, 0x3FC0527DC0000000, !dbg !62 + %1830 = fmul float %1552, 0x3FC0527DC0000000, !dbg !62 + %1831 = fmul float %1553, 0x3FC0527DC0000000, !dbg !62 + %1832 = fmul float %1554, 0x3FC0527DC0000000, !dbg !62 + %1833 = fmul float %1555, 0x3FC0527DC0000000, !dbg !62 + %1834 = fmul float %1556, 0x3FC0527DC0000000, !dbg !62 + %1835 = fmul float %1557, 0x3FC0527DC0000000, !dbg !62 + %1836 = fmul float %1558, 0x3FC0527DC0000000, !dbg !62 + %1837 = fmul float %1559, 0x3FC0527DC0000000, !dbg !62 + %1838 = fmul float %1560, 0x3FC0527DC0000000, !dbg !62 + %1839 = shufflevector <2 x half> %1604, <2 x half> %1607, <8 x i32> , !dbg !81 + %1840 = shufflevector <8 x half> %1839, <8 x half> %1611, <8 x i32> , !dbg !81 + %1841 = shufflevector <8 x half> %1840, <8 x half> %1615, <8 x i32> , !dbg !81 + %1842 = shufflevector <2 x half> %1618, <2 x half> %1621, <8 x i32> , !dbg !81 + %1843 = shufflevector <8 x half> %1842, <8 x half> %1625, <8 x i32> , !dbg !81 + %1844 = shufflevector <8 x half> %1843, <8 x half> %1629, <8 x i32> , !dbg !81 + %1845 = shufflevector <2 x half> %1632, <2 x half> %1635, <8 x i32> , !dbg !81 + %1846 = shufflevector <8 x half> %1845, <8 x half> %1639, <8 x i32> , !dbg !81 + %1847 = shufflevector <8 x half> %1846, <8 x half> %1643, <8 x i32> , !dbg !81 + %1848 = shufflevector <2 x half> %1646, <2 x half> %1649, <8 x i32> , !dbg !81 + %1849 = shufflevector <8 x half> %1848, <8 x half> %1653, <8 x i32> , !dbg !81 + %1850 = shufflevector <8 x half> %1849, <8 x half> %1657, <8 x i32> , !dbg !81 + %1851 = shufflevector <4 x half> %1663, <4 x half> %1666, <8 x i32> , !dbg !81 + %1852 = shufflevector <4 x half> %1669, <4 x half> %1672, <8 x i32> , !dbg !81 + %1853 = shufflevector <4 x half> %1675, <4 x half> %1678, <8 x i32> , !dbg !81 + %1854 = shufflevector <4 x half> %1681, <4 x half> %1684, <8 x i32> , !dbg !81 + %1855 = shufflevector <4 x half> %1691, <4 x half> %1694, <8 x i32> , !dbg !81 + %1856 = shufflevector <4 x half> %1697, <4 x half> %1700, <8 x i32> , !dbg !81 + %1857 = shufflevector <4 x half> %1703, <4 x half> %1706, <8 x i32> , !dbg !81 + %1858 = shufflevector <4 x half> %1709, <4 x half> %1712, <8 x i32> , !dbg !81 + %1859 = shufflevector <4 x half> %1718, <4 x half> %1721, <8 x i32> , !dbg !81 + %1860 = shufflevector <4 x half> %1724, <4 x half> %1727, <8 x i32> , !dbg !81 + %1861 = shufflevector <4 x half> %1730, <4 x half> %1733, <8 x i32> , !dbg !81 + %1862 = shufflevector <4 x half> %1736, <4 x half> %1739, <8 x i32> , !dbg !81 + %1863 = shufflevector <4 x half> %1745, <4 x half> %1748, <8 x i32> , !dbg !81 + %1864 = shufflevector <4 x half> %1751, <4 x half> %1754, <8 x i32> , !dbg !81 + %1865 = shufflevector <4 x half> %1757, <4 x half> %1760, <8 x i32> , !dbg !81 + %1866 = shufflevector <4 x half> %1763, <4 x half> %1766, <8 x i32> , !dbg !81 + %1867 = shufflevector <2 x float> %1599, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !81 + %1868 = fmul <16 x float> %1173, %1867, !dbg !81 + %1869 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1851, <8 x half> %1841, <16 x float> %1868, i32 0, i32 0, i32 0), !dbg !81 + %1870 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1852, <8 x half> %1844, <16 x float> %1869, i32 0, i32 0, i32 0), !dbg !81 + %1871 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1853, <8 x half> %1847, <16 x float> %1870, i32 0, i32 0, i32 0), !dbg !81 + %1872 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1854, <8 x half> %1850, <16 x float> %1871, i32 0, i32 0, i32 0), !dbg !81 + %1873 = shufflevector <2 x float> %1599, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !81 + %1874 = fmul <16 x float> %1190, %1873, !dbg !81 + %1875 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1855, <8 x half> %1841, <16 x float> %1874, i32 0, i32 0, i32 0), !dbg !81 + %1876 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1856, <8 x half> %1844, <16 x float> %1875, i32 0, i32 0, i32 0), !dbg !81 + %1877 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1857, <8 x half> %1847, <16 x float> %1876, i32 0, i32 0, i32 0), !dbg !81 + %1878 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1858, <8 x half> %1850, <16 x float> %1877, i32 0, i32 0, i32 0), !dbg !81 + %1879 = shufflevector <2 x float> %1599, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !81 + %1880 = fmul <16 x float> %1207, %1879, !dbg !81 + %1881 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1859, <8 x half> %1841, <16 x float> %1880, i32 0, i32 0, i32 0), !dbg !81 + %1882 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1860, <8 x half> %1844, <16 x float> %1881, i32 0, i32 0, i32 0), !dbg !81 + %1883 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1861, <8 x half> %1847, <16 x float> %1882, i32 0, i32 0, i32 0), !dbg !81 + %1884 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1862, <8 x half> %1850, <16 x float> %1883, i32 0, i32 0, i32 0), !dbg !81 + %1885 = shufflevector <2 x float> %1599, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !81 + %1886 = fmul <16 x float> %1224, %1885, !dbg !81 + %1887 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1863, <8 x half> %1841, <16 x float> %1886, i32 0, i32 0, i32 0), !dbg !81 + %1888 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1864, <8 x half> %1844, <16 x float> %1887, i32 0, i32 0, i32 0), !dbg !81 + %1889 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1865, <8 x half> %1847, <16 x float> %1888, i32 0, i32 0, i32 0), !dbg !81 + %1890 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1866, <8 x half> %1850, <16 x float> %1889, i32 0, i32 0, i32 0), !dbg !81 + tail call void @llvm.amdgcn.s.waitcnt(i32 -49168), !dbg !47 + fence syncscope("workgroup") release, !dbg !47 + tail call void @llvm.amdgcn.s.barrier(), !dbg !47 + fence syncscope("workgroup") acquire, !dbg !47 + %1891 = getelementptr half, ptr addrspace(3) %1121, i32 %198, !dbg !47 + %1892 = getelementptr half, ptr addrspace(3) %1891, i32 %276, !dbg !47 + %1893 = load <8 x half>, ptr addrspace(3) %1892, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1894 = getelementptr half, ptr addrspace(3) %1121, i32 %201, !dbg !47 + %1895 = getelementptr half, ptr addrspace(3) %1894, i32 %276, !dbg !47 + %1896 = load <8 x half>, ptr addrspace(3) %1895, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1897 = getelementptr half, ptr addrspace(3) %1121, i32 %203, !dbg !47 + %1898 = getelementptr half, ptr addrspace(3) %1897, i32 %276, !dbg !47 + %1899 = load <8 x half>, ptr addrspace(3) %1898, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1900 = getelementptr half, ptr addrspace(3) %1121, i32 %205, !dbg !47 + %1901 = getelementptr half, ptr addrspace(3) %1900, i32 %276, !dbg !47 + %1902 = load <8 x half>, ptr addrspace(3) %1901, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1903 = getelementptr half, ptr addrspace(3) %1121, i32 %207, !dbg !47 + %1904 = getelementptr half, ptr addrspace(3) %1903, i32 %276, !dbg !47 + %1905 = load <8 x half>, ptr addrspace(3) %1904, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1906 = getelementptr half, ptr addrspace(3) %1121, i32 %209, !dbg !47 + %1907 = getelementptr half, ptr addrspace(3) %1906, i32 %276, !dbg !47 + %1908 = load <8 x half>, ptr addrspace(3) %1907, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1909 = getelementptr half, ptr addrspace(3) %1121, i32 %211, !dbg !47 + %1910 = getelementptr half, ptr addrspace(3) %1909, i32 %276, !dbg !47 + %1911 = load <8 x half>, ptr addrspace(3) %1910, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1912 = getelementptr half, ptr addrspace(3) %1121, i32 %213, !dbg !47 + %1913 = getelementptr half, ptr addrspace(3) %1912, i32 %276, !dbg !47 + %1914 = load <8 x half>, ptr addrspace(3) %1913, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1915 = getelementptr half, ptr addrspace(3) %1891, i32 %285, !dbg !47 + %1916 = load <8 x half>, ptr addrspace(3) %1915, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1917 = getelementptr half, ptr addrspace(3) %1894, i32 %285, !dbg !47 + %1918 = load <8 x half>, ptr addrspace(3) %1917, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1919 = getelementptr half, ptr addrspace(3) %1897, i32 %285, !dbg !47 + %1920 = load <8 x half>, ptr addrspace(3) %1919, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1921 = getelementptr half, ptr addrspace(3) %1900, i32 %285, !dbg !47 + %1922 = load <8 x half>, ptr addrspace(3) %1921, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1923 = getelementptr half, ptr addrspace(3) %1903, i32 %285, !dbg !47 + %1924 = load <8 x half>, ptr addrspace(3) %1923, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1925 = getelementptr half, ptr addrspace(3) %1906, i32 %285, !dbg !47 + %1926 = load <8 x half>, ptr addrspace(3) %1925, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1927 = getelementptr half, ptr addrspace(3) %1909, i32 %285, !dbg !47 + %1928 = load <8 x half>, ptr addrspace(3) %1927, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1929 = getelementptr half, ptr addrspace(3) %1912, i32 %285, !dbg !47 + %1930 = load <8 x half>, ptr addrspace(3) %1929, align 16, !dbg !47, !alias.scope !52, !noalias !48 + %1931 = getelementptr i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.idx359, !dbg !47 + %1932 = getelementptr inbounds nuw i8, ptr addrspace(3) %1931, i32 %.idx2, !dbg !47 + %1933 = getelementptr inbounds nuw i8, ptr addrspace(3) %1931, i32 %.idx4, !dbg !47 + %1934 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %1767, i16 %333, i32 2147483646, i32 159744), !dbg !47 + %1935 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %338, i32 %113), !dbg !47 + %1936 = tail call i64 @llvm.amdgcn.ballot.i64(i1 true), !dbg !47 + %1937 = lshr i64 %1936, %340, !dbg !47 + %1938 = trunc i64 %1937 to i1, !dbg !47 + %1939 = shl i32 %1935, 1, !dbg !47 + %1940 = select i1 %1938, i32 %1939, i32 -2147483648, !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %1934, ptr addrspace(3) %1932, i32 16, i32 %1940, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + %1941 = tail call i32 @llvm.amdgcn.ds.bpermute(i32 %338, i32 %114), !dbg !47 + %1942 = shl i32 %1941, 1, !dbg !47 + %1943 = select i1 %1938, i32 %1942, i32 -2147483648, !dbg !47 + tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %1934, ptr addrspace(3) nonnull %1933, i32 16, i32 %1943, i32 0, i32 0, i32 0), !dbg !47, !alias.scope !48 + %1944 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1893, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0), !dbg !54 + %1945 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1896, <8 x half> %218, <16 x float> %1944, i32 0, i32 0, i32 0), !dbg !54 + %1946 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1899, <8 x half> %220, <16 x float> %1945, i32 0, i32 0, i32 0), !dbg !54 + %1947 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1902, <8 x half> %222, <16 x float> %1946, i32 0, i32 0, i32 0), !dbg !54 + %1948 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1905, <8 x half> %224, <16 x float> %1947, i32 0, i32 0, i32 0), !dbg !54 + %1949 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1908, <8 x half> %226, <16 x float> %1948, i32 0, i32 0, i32 0), !dbg !54 + %1950 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1911, <8 x half> %228, <16 x float> %1949, i32 0, i32 0, i32 0), !dbg !54 + %1951 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1914, <8 x half> %230, <16 x float> %1950, i32 0, i32 0, i32 0), !dbg !54 + %1952 = extractelement <16 x float> %1951, i64 0, !dbg !54 + %1953 = extractelement <16 x float> %1951, i64 1, !dbg !54 + %1954 = extractelement <16 x float> %1951, i64 2, !dbg !54 + %1955 = extractelement <16 x float> %1951, i64 3, !dbg !54 + %1956 = extractelement <16 x float> %1951, i64 4, !dbg !54 + %1957 = extractelement <16 x float> %1951, i64 5, !dbg !54 + %1958 = extractelement <16 x float> %1951, i64 6, !dbg !54 + %1959 = extractelement <16 x float> %1951, i64 7, !dbg !54 + %1960 = extractelement <16 x float> %1951, i64 8, !dbg !54 + %1961 = extractelement <16 x float> %1951, i64 9, !dbg !54 + %1962 = extractelement <16 x float> %1951, i64 10, !dbg !54 + %1963 = extractelement <16 x float> %1951, i64 11, !dbg !54 + %1964 = extractelement <16 x float> %1951, i64 12, !dbg !54 + %1965 = extractelement <16 x float> %1951, i64 13, !dbg !54 + %1966 = extractelement <16 x float> %1951, i64 14, !dbg !54 + %1967 = extractelement <16 x float> %1951, i64 15, !dbg !54 + %1968 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1916, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0), !dbg !54 + %1969 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1918, <8 x half> %218, <16 x float> %1968, i32 0, i32 0, i32 0), !dbg !54 + %1970 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1920, <8 x half> %220, <16 x float> %1969, i32 0, i32 0, i32 0), !dbg !54 + %1971 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1922, <8 x half> %222, <16 x float> %1970, i32 0, i32 0, i32 0), !dbg !54 + %1972 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1924, <8 x half> %224, <16 x float> %1971, i32 0, i32 0, i32 0), !dbg !54 + %1973 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1926, <8 x half> %226, <16 x float> %1972, i32 0, i32 0, i32 0), !dbg !54 + %1974 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1928, <8 x half> %228, <16 x float> %1973, i32 0, i32 0, i32 0), !dbg !54 + %1975 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1930, <8 x half> %230, <16 x float> %1974, i32 0, i32 0, i32 0), !dbg !54 + %1976 = extractelement <16 x float> %1975, i64 0, !dbg !54 + %1977 = extractelement <16 x float> %1975, i64 1, !dbg !54 + %1978 = extractelement <16 x float> %1975, i64 2, !dbg !54 + %1979 = extractelement <16 x float> %1975, i64 3, !dbg !54 + %1980 = extractelement <16 x float> %1975, i64 4, !dbg !54 + %1981 = extractelement <16 x float> %1975, i64 5, !dbg !54 + %1982 = extractelement <16 x float> %1975, i64 6, !dbg !54 + %1983 = extractelement <16 x float> %1975, i64 7, !dbg !54 + %1984 = extractelement <16 x float> %1975, i64 8, !dbg !54 + %1985 = extractelement <16 x float> %1975, i64 9, !dbg !54 + %1986 = extractelement <16 x float> %1975, i64 10, !dbg !54 + %1987 = extractelement <16 x float> %1975, i64 11, !dbg !54 + %1988 = extractelement <16 x float> %1975, i64 12, !dbg !54 + %1989 = extractelement <16 x float> %1975, i64 13, !dbg !54 + %1990 = extractelement <16 x float> %1975, i64 14, !dbg !54 + %1991 = extractelement <16 x float> %1975, i64 15, !dbg !54 + tail call void @llvm.amdgcn.s.waitcnt(i32 -49166), !dbg !47 + fence syncscope("workgroup") release, !dbg !47 + tail call void @llvm.amdgcn.s.barrier(), !dbg !47 + fence syncscope("workgroup") acquire, !dbg !47 + %1992 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1661, !dbg !47 + %1993 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %1992), !dbg !47, !alias.scope !52, !noalias !48 + %1994 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1664, !dbg !47 + %1995 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1994), !dbg !47, !alias.scope !52, !noalias !48 + %1996 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1667, !dbg !47 + %1997 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1996), !dbg !47, !alias.scope !52, !noalias !48 + %1998 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1670, !dbg !47 + %1999 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %1998), !dbg !47, !alias.scope !52, !noalias !48 + %2000 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1673, !dbg !47 + %2001 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2000), !dbg !47, !alias.scope !52, !noalias !48 + %2002 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1676, !dbg !47 + %2003 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2002), !dbg !47, !alias.scope !52, !noalias !48 + %2004 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1679, !dbg !47 + %2005 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2004), !dbg !47, !alias.scope !52, !noalias !48 + %2006 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1682, !dbg !47 + %2007 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2006), !dbg !47, !alias.scope !52, !noalias !48 + %2008 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1689, !dbg !47 + %2009 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %2008), !dbg !47, !alias.scope !52, !noalias !48 + %2010 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1692, !dbg !47 + %2011 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2010), !dbg !47, !alias.scope !52, !noalias !48 + %2012 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1695, !dbg !47 + %2013 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2012), !dbg !47, !alias.scope !52, !noalias !48 + %2014 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1698, !dbg !47 + %2015 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2014), !dbg !47, !alias.scope !52, !noalias !48 + %2016 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1701, !dbg !47 + %2017 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2016), !dbg !47, !alias.scope !52, !noalias !48 + %2018 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1704, !dbg !47 + %2019 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2018), !dbg !47, !alias.scope !52, !noalias !48 + %2020 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1707, !dbg !47 + %2021 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2020), !dbg !47, !alias.scope !52, !noalias !48 + %2022 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1710, !dbg !47 + %2023 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2022), !dbg !47, !alias.scope !52, !noalias !48 + %2024 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1716, !dbg !47 + %2025 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %2024), !dbg !47, !alias.scope !52, !noalias !48 + %2026 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1719, !dbg !47 + %2027 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2026), !dbg !47, !alias.scope !52, !noalias !48 + %2028 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1722, !dbg !47 + %2029 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2028), !dbg !47, !alias.scope !52, !noalias !48 + %2030 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1725, !dbg !47 + %2031 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2030), !dbg !47, !alias.scope !52, !noalias !48 + %2032 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1728, !dbg !47 + %2033 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2032), !dbg !47, !alias.scope !52, !noalias !48 + %2034 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1731, !dbg !47 + %2035 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2034), !dbg !47, !alias.scope !52, !noalias !48 + %2036 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1734, !dbg !47 + %2037 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2036), !dbg !47, !alias.scope !52, !noalias !48 + %2038 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1737, !dbg !47 + %2039 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2038), !dbg !47, !alias.scope !52, !noalias !48 + %2040 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1743, !dbg !47 + %2041 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %2040), !dbg !47, !alias.scope !52, !noalias !48 + %2042 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1746, !dbg !47 + %2043 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2042), !dbg !47, !alias.scope !52, !noalias !48 + %2044 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1749, !dbg !47 + %2045 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2044), !dbg !47, !alias.scope !52, !noalias !48 + %2046 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1752, !dbg !47 + %2047 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2046), !dbg !47, !alias.scope !52, !noalias !48 + %2048 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1755, !dbg !47 + %2049 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2048), !dbg !47, !alias.scope !52, !noalias !48 + %2050 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1758, !dbg !47 + %2051 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2050), !dbg !47, !alias.scope !52, !noalias !48 + %2052 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1761, !dbg !47 + %2053 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2052), !dbg !47, !alias.scope !52, !noalias !48 + %2054 = getelementptr inbounds nuw half, ptr addrspace(3) %1404, i32 %1764, !dbg !47 + %2055 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2054), !dbg !47, !alias.scope !52, !noalias !48 + %2056 = tail call float @llvm.maxnum.f32(float %1952, float %1953), !dbg !56 + %2057 = tail call float @llvm.maxnum.f32(float %2056, float %1954), !dbg !56 + %2058 = tail call float @llvm.maxnum.f32(float %2057, float %1955), !dbg !56 + %2059 = tail call float @llvm.maxnum.f32(float %2058, float %1956), !dbg !56 + %2060 = tail call float @llvm.maxnum.f32(float %2059, float %1957), !dbg !56 + %2061 = tail call float @llvm.maxnum.f32(float %2060, float %1958), !dbg !56 + %2062 = tail call float @llvm.maxnum.f32(float %2061, float %1959), !dbg !56 + %2063 = tail call float @llvm.maxnum.f32(float %2062, float %1960), !dbg !56 + %2064 = tail call float @llvm.maxnum.f32(float %2063, float %1961), !dbg !56 + %2065 = tail call float @llvm.maxnum.f32(float %2064, float %1962), !dbg !56 + %2066 = tail call float @llvm.maxnum.f32(float %2065, float %1963), !dbg !56 + %2067 = tail call float @llvm.maxnum.f32(float %2066, float %1964), !dbg !56 + %2068 = tail call float @llvm.maxnum.f32(float %2067, float %1965), !dbg !56 + %2069 = tail call float @llvm.maxnum.f32(float %2068, float %1966), !dbg !56 + %2070 = tail call float @llvm.maxnum.f32(float %2069, float %1967), !dbg !56 + %2071 = tail call float @llvm.maxnum.f32(float %2070, float %1976), !dbg !56 + %2072 = tail call float @llvm.maxnum.f32(float %2071, float %1977), !dbg !56 + %2073 = tail call float @llvm.maxnum.f32(float %2072, float %1978), !dbg !56 + %2074 = tail call float @llvm.maxnum.f32(float %2073, float %1979), !dbg !56 + %2075 = tail call float @llvm.maxnum.f32(float %2074, float %1980), !dbg !56 + %2076 = tail call float @llvm.maxnum.f32(float %2075, float %1981), !dbg !56 + %2077 = tail call float @llvm.maxnum.f32(float %2076, float %1982), !dbg !56 + %2078 = tail call float @llvm.maxnum.f32(float %2077, float %1983), !dbg !56 + %2079 = tail call float @llvm.maxnum.f32(float %2078, float %1984), !dbg !56 + %2080 = tail call float @llvm.maxnum.f32(float %2079, float %1985), !dbg !56 + %2081 = tail call float @llvm.maxnum.f32(float %2080, float %1986), !dbg !56 + %2082 = tail call float @llvm.maxnum.f32(float %2081, float %1987), !dbg !56 + %2083 = tail call float @llvm.maxnum.f32(float %2082, float %1988), !dbg !56 + %2084 = tail call float @llvm.maxnum.f32(float %2083, float %1989), !dbg !56 + %2085 = tail call float @llvm.maxnum.f32(float %2084, float %1990), !dbg !56 + %2086 = tail call float @llvm.maxnum.f32(float %2085, float %1991), !dbg !56 + %2087 = bitcast float %2086 to i32, !dbg !59 + %2088 = tail call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %2087, i32 %2087, i1 false, i1 false), !dbg !59 + %2089 = extractvalue { i32, i32 } %2088, 0, !dbg !59 + %2090 = extractvalue { i32, i32 } %2088, 1, !dbg !59 + %2091 = bitcast i32 %2089 to float, !dbg !59 + %2092 = bitcast i32 %2090 to float, !dbg !59 + %2093 = tail call float @llvm.maxnum.f32(float %2091, float %2092), !dbg !56 + %2094 = tail call float @llvm.maxnum.f32(float %1806, float %2093), !dbg !60 + %2095 = insertelement <2 x float> poison, float %1806, i64 0, !dbg !61 + %2096 = insertelement <2 x float> %2095, float %2094, i64 1, !dbg !61 + %2097 = fmul <2 x float> %2096, splat (float 0x3FC0527DC0000000), !dbg !61 + %2098 = extractelement <2 x float> %2097, i64 0, !dbg !65 + %2099 = fsub float %1807, %2098, !dbg !63 + %2100 = fsub float %1808, %2098, !dbg !63 + %2101 = fsub float %1809, %2098, !dbg !63 + %2102 = fsub float %1810, %2098, !dbg !63 + %2103 = fsub float %1811, %2098, !dbg !63 + %2104 = fsub float %1812, %2098, !dbg !63 + %2105 = fsub float %1813, %2098, !dbg !63 + %2106 = fsub float %1814, %2098, !dbg !63 + %2107 = fsub float %1815, %2098, !dbg !63 + %2108 = fsub float %1816, %2098, !dbg !63 + %2109 = fsub float %1817, %2098, !dbg !63 + %2110 = fsub float %1818, %2098, !dbg !63 + %2111 = fsub float %1819, %2098, !dbg !63 + %2112 = fsub float %1820, %2098, !dbg !63 + %2113 = fsub float %1821, %2098, !dbg !63 + %2114 = fsub float %1822, %2098, !dbg !63 + %2115 = fsub float %1823, %2098, !dbg !63 + %2116 = fsub float %1824, %2098, !dbg !63 + %2117 = fsub float %1825, %2098, !dbg !63 + %2118 = fsub float %1826, %2098, !dbg !63 + %2119 = fsub float %1827, %2098, !dbg !63 + %2120 = fsub float %1828, %2098, !dbg !63 + %2121 = fsub float %1829, %2098, !dbg !63 + %2122 = fsub float %1830, %2098, !dbg !63 + %2123 = fsub float %1831, %2098, !dbg !63 + %2124 = fsub float %1832, %2098, !dbg !63 + %2125 = fsub float %1833, %2098, !dbg !63 + %2126 = fsub float %1834, %2098, !dbg !63 + %2127 = fsub float %1835, %2098, !dbg !63 + %2128 = fsub float %1836, %2098, !dbg !63 + %2129 = fsub float %1837, %2098, !dbg !63 + %2130 = fsub float %1838, %2098, !dbg !63 + %2131 = tail call float @llvm.amdgcn.exp2.f32(float %2099), !dbg !64 + %2132 = tail call float @llvm.amdgcn.exp2.f32(float %2100), !dbg !64 + %2133 = tail call float @llvm.amdgcn.exp2.f32(float %2101), !dbg !64 + %2134 = tail call float @llvm.amdgcn.exp2.f32(float %2102), !dbg !64 + %2135 = tail call float @llvm.amdgcn.exp2.f32(float %2103), !dbg !64 + %2136 = tail call float @llvm.amdgcn.exp2.f32(float %2104), !dbg !64 + %2137 = tail call float @llvm.amdgcn.exp2.f32(float %2105), !dbg !64 + %2138 = tail call float @llvm.amdgcn.exp2.f32(float %2106), !dbg !64 + %2139 = tail call float @llvm.amdgcn.exp2.f32(float %2107), !dbg !64 + %2140 = tail call float @llvm.amdgcn.exp2.f32(float %2108), !dbg !64 + %2141 = tail call float @llvm.amdgcn.exp2.f32(float %2109), !dbg !64 + %2142 = tail call float @llvm.amdgcn.exp2.f32(float %2110), !dbg !64 + %2143 = tail call float @llvm.amdgcn.exp2.f32(float %2111), !dbg !64 + %2144 = tail call float @llvm.amdgcn.exp2.f32(float %2112), !dbg !64 + %2145 = tail call float @llvm.amdgcn.exp2.f32(float %2113), !dbg !64 + %2146 = tail call float @llvm.amdgcn.exp2.f32(float %2114), !dbg !64 + %2147 = tail call float @llvm.amdgcn.exp2.f32(float %2115), !dbg !64 + %2148 = tail call float @llvm.amdgcn.exp2.f32(float %2116), !dbg !64 + %2149 = tail call float @llvm.amdgcn.exp2.f32(float %2117), !dbg !64 + %2150 = tail call float @llvm.amdgcn.exp2.f32(float %2118), !dbg !64 + %2151 = tail call float @llvm.amdgcn.exp2.f32(float %2119), !dbg !64 + %2152 = tail call float @llvm.amdgcn.exp2.f32(float %2120), !dbg !64 + %2153 = tail call float @llvm.amdgcn.exp2.f32(float %2121), !dbg !64 + %2154 = tail call float @llvm.amdgcn.exp2.f32(float %2122), !dbg !64 + %2155 = tail call float @llvm.amdgcn.exp2.f32(float %2123), !dbg !64 + %2156 = tail call float @llvm.amdgcn.exp2.f32(float %2124), !dbg !64 + %2157 = tail call float @llvm.amdgcn.exp2.f32(float %2125), !dbg !64 + %2158 = tail call float @llvm.amdgcn.exp2.f32(float %2126), !dbg !64 + %2159 = tail call float @llvm.amdgcn.exp2.f32(float %2127), !dbg !64 + %2160 = tail call float @llvm.amdgcn.exp2.f32(float %2128), !dbg !64 + %2161 = tail call float @llvm.amdgcn.exp2.f32(float %2129), !dbg !64 + %2162 = tail call float @llvm.amdgcn.exp2.f32(float %2130), !dbg !64 + %2163 = fsub float %1264, %2098, !dbg !65 + %2164 = tail call float @llvm.amdgcn.exp2.f32(float %2163), !dbg !66 + %2165 = fadd float %2131, %2132, !dbg !72 + %2166 = fadd float %2133, %2165, !dbg !72 + %2167 = fadd float %2134, %2166, !dbg !72 + %2168 = fadd float %2135, %2167, !dbg !72 + %2169 = fadd float %2136, %2168, !dbg !72 + %2170 = fadd float %2137, %2169, !dbg !72 + %2171 = fadd float %2138, %2170, !dbg !72 + %2172 = fadd float %2139, %2171, !dbg !72 + %2173 = fadd float %2140, %2172, !dbg !72 + %2174 = fadd float %2141, %2173, !dbg !72 + %2175 = fadd float %2142, %2174, !dbg !72 + %2176 = fadd float %2143, %2175, !dbg !72 + %2177 = fadd float %2144, %2176, !dbg !72 + %2178 = fadd float %2145, %2177, !dbg !72 + %2179 = fadd float %2146, %2178, !dbg !72 + %2180 = fadd float %2147, %2179, !dbg !72 + %2181 = fadd float %2148, %2180, !dbg !72 + %2182 = fadd float %2149, %2181, !dbg !72 + %2183 = fadd float %2150, %2182, !dbg !72 + %2184 = fadd float %2151, %2183, !dbg !72 + %2185 = fadd float %2152, %2184, !dbg !72 + %2186 = fadd float %2153, %2185, !dbg !72 + %2187 = fadd float %2154, %2186, !dbg !72 + %2188 = fadd float %2155, %2187, !dbg !72 + %2189 = fadd float %2156, %2188, !dbg !72 + %2190 = fadd float %2157, %2189, !dbg !72 + %2191 = fadd float %2158, %2190, !dbg !72 + %2192 = fadd float %2159, %2191, !dbg !72 + %2193 = fadd float %2160, %2192, !dbg !72 + %2194 = fadd float %2161, %2193, !dbg !72 + %2195 = fadd float %2162, %2194, !dbg !72 + %2196 = bitcast float %2195 to i32, !dbg !73 + %2197 = tail call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %2196, i32 %2196, i1 false, i1 false), !dbg !73 + %2198 = extractvalue { i32, i32 } %2197, 0, !dbg !73 + %2199 = extractvalue { i32, i32 } %2197, 1, !dbg !73 + %2200 = bitcast i32 %2198 to float, !dbg !73 + %2201 = bitcast i32 %2199 to float, !dbg !73 + %2202 = fadd float %2200, %2201, !dbg !72 + %2203 = insertelement <2 x float> poison, float %2164, i64 0, !dbg !74 + %2204 = fmul float %1601, %2164, !dbg !75 + %2205 = fadd float %2204, %2202, !dbg !76 + %2206 = insertelement <2 x float> poison, float %2131, i64 0, !dbg !77 + %2207 = insertelement <2 x float> %2206, float %2132, i64 1, !dbg !77 + %2208 = fptrunc <2 x float> %2207 to <2 x half>, !dbg !77 + %2209 = insertelement <2 x float> poison, float %2133, i64 0, !dbg !77 + %2210 = insertelement <2 x float> %2209, float %2134, i64 1, !dbg !77 + %2211 = fptrunc <2 x float> %2210 to <2 x half>, !dbg !77 + %2212 = insertelement <2 x float> poison, float %2135, i64 0, !dbg !77 + %2213 = insertelement <2 x float> %2212, float %2136, i64 1, !dbg !77 + %2214 = fptrunc <2 x float> %2213 to <2 x half>, !dbg !77 + %2215 = shufflevector <2 x half> %2214, <2 x half> poison, <8 x i32> + %2216 = insertelement <2 x float> poison, float %2137, i64 0, !dbg !77 + %2217 = insertelement <2 x float> %2216, float %2138, i64 1, !dbg !77 + %2218 = fptrunc <2 x float> %2217 to <2 x half>, !dbg !77 + %2219 = shufflevector <2 x half> %2218, <2 x half> poison, <8 x i32> + %2220 = insertelement <2 x float> poison, float %2139, i64 0, !dbg !77 + %2221 = insertelement <2 x float> %2220, float %2140, i64 1, !dbg !77 + %2222 = fptrunc <2 x float> %2221 to <2 x half>, !dbg !77 + %2223 = insertelement <2 x float> poison, float %2141, i64 0, !dbg !77 + %2224 = insertelement <2 x float> %2223, float %2142, i64 1, !dbg !77 + %2225 = fptrunc <2 x float> %2224 to <2 x half>, !dbg !77 + %2226 = insertelement <2 x float> poison, float %2143, i64 0, !dbg !77 + %2227 = insertelement <2 x float> %2226, float %2144, i64 1, !dbg !77 + %2228 = fptrunc <2 x float> %2227 to <2 x half>, !dbg !77 + %2229 = shufflevector <2 x half> %2228, <2 x half> poison, <8 x i32> + %2230 = insertelement <2 x float> poison, float %2145, i64 0, !dbg !77 + %2231 = insertelement <2 x float> %2230, float %2146, i64 1, !dbg !77 + %2232 = fptrunc <2 x float> %2231 to <2 x half>, !dbg !77 + %2233 = shufflevector <2 x half> %2232, <2 x half> poison, <8 x i32> + %2234 = insertelement <2 x float> poison, float %2147, i64 0, !dbg !77 + %2235 = insertelement <2 x float> %2234, float %2148, i64 1, !dbg !77 + %2236 = fptrunc <2 x float> %2235 to <2 x half>, !dbg !77 + %2237 = insertelement <2 x float> poison, float %2149, i64 0, !dbg !77 + %2238 = insertelement <2 x float> %2237, float %2150, i64 1, !dbg !77 + %2239 = fptrunc <2 x float> %2238 to <2 x half>, !dbg !77 + %2240 = insertelement <2 x float> poison, float %2151, i64 0, !dbg !77 + %2241 = insertelement <2 x float> %2240, float %2152, i64 1, !dbg !77 + %2242 = fptrunc <2 x float> %2241 to <2 x half>, !dbg !77 + %2243 = shufflevector <2 x half> %2242, <2 x half> poison, <8 x i32> + %2244 = insertelement <2 x float> poison, float %2153, i64 0, !dbg !77 + %2245 = insertelement <2 x float> %2244, float %2154, i64 1, !dbg !77 + %2246 = fptrunc <2 x float> %2245 to <2 x half>, !dbg !77 + %2247 = shufflevector <2 x half> %2246, <2 x half> poison, <8 x i32> + %2248 = insertelement <2 x float> poison, float %2155, i64 0, !dbg !77 + %2249 = insertelement <2 x float> %2248, float %2156, i64 1, !dbg !77 + %2250 = fptrunc <2 x float> %2249 to <2 x half>, !dbg !77 + %2251 = insertelement <2 x float> poison, float %2157, i64 0, !dbg !77 + %2252 = insertelement <2 x float> %2251, float %2158, i64 1, !dbg !77 + %2253 = fptrunc <2 x float> %2252 to <2 x half>, !dbg !77 + %2254 = insertelement <2 x float> poison, float %2159, i64 0, !dbg !77 + %2255 = insertelement <2 x float> %2254, float %2160, i64 1, !dbg !77 + %2256 = fptrunc <2 x float> %2255 to <2 x half>, !dbg !77 + %2257 = shufflevector <2 x half> %2256, <2 x half> poison, <8 x i32> + %2258 = insertelement <2 x float> poison, float %2161, i64 0, !dbg !77 + %2259 = insertelement <2 x float> %2258, float %2162, i64 1, !dbg !77 + %2260 = fptrunc <2 x float> %2259 to <2 x half>, !dbg !77 + %2261 = shufflevector <2 x half> %2260, <2 x half> poison, <8 x i32> + %2262 = fmul float %1952, 0x3FC0527DC0000000, !dbg !62 + %2263 = fmul float %1953, 0x3FC0527DC0000000, !dbg !62 + %2264 = fmul float %1954, 0x3FC0527DC0000000, !dbg !62 + %2265 = fmul float %1955, 0x3FC0527DC0000000, !dbg !62 + %2266 = fmul float %1956, 0x3FC0527DC0000000, !dbg !62 + %2267 = fmul float %1957, 0x3FC0527DC0000000, !dbg !62 + %2268 = fmul float %1958, 0x3FC0527DC0000000, !dbg !62 + %2269 = fmul float %1959, 0x3FC0527DC0000000, !dbg !62 + %2270 = fmul float %1960, 0x3FC0527DC0000000, !dbg !62 + %2271 = fmul float %1961, 0x3FC0527DC0000000, !dbg !62 + %2272 = fmul float %1962, 0x3FC0527DC0000000, !dbg !62 + %2273 = fmul float %1963, 0x3FC0527DC0000000, !dbg !62 + %2274 = fmul float %1964, 0x3FC0527DC0000000, !dbg !62 + %2275 = fmul float %1965, 0x3FC0527DC0000000, !dbg !62 + %2276 = fmul float %1966, 0x3FC0527DC0000000, !dbg !62 + %2277 = fmul float %1967, 0x3FC0527DC0000000, !dbg !62 + %2278 = fmul float %1976, 0x3FC0527DC0000000, !dbg !62 + %2279 = fmul float %1977, 0x3FC0527DC0000000, !dbg !62 + %2280 = fmul float %1978, 0x3FC0527DC0000000, !dbg !62 + %2281 = fmul float %1979, 0x3FC0527DC0000000, !dbg !62 + %2282 = fmul float %1980, 0x3FC0527DC0000000, !dbg !62 + %2283 = fmul float %1981, 0x3FC0527DC0000000, !dbg !62 + %2284 = fmul float %1982, 0x3FC0527DC0000000, !dbg !62 + %2285 = fmul float %1983, 0x3FC0527DC0000000, !dbg !62 + %2286 = fmul float %1984, 0x3FC0527DC0000000, !dbg !62 + %2287 = fmul float %1985, 0x3FC0527DC0000000, !dbg !62 + %2288 = fmul float %1986, 0x3FC0527DC0000000, !dbg !62 + %2289 = fmul float %1987, 0x3FC0527DC0000000, !dbg !62 + %2290 = fmul float %1988, 0x3FC0527DC0000000, !dbg !62 + %2291 = fmul float %1989, 0x3FC0527DC0000000, !dbg !62 + %2292 = fmul float %1990, 0x3FC0527DC0000000, !dbg !62 + %2293 = fmul float %1991, 0x3FC0527DC0000000, !dbg !62 + %2294 = extractelement <2 x float> %2097, i64 1, !dbg !65 + %2295 = fsub float %2262, %2294, !dbg !63 + %2296 = fsub float %2263, %2294, !dbg !63 + %2297 = fsub float %2264, %2294, !dbg !63 + %2298 = fsub float %2265, %2294, !dbg !63 + %2299 = fsub float %2266, %2294, !dbg !63 + %2300 = fsub float %2267, %2294, !dbg !63 + %2301 = fsub float %2268, %2294, !dbg !63 + %2302 = fsub float %2269, %2294, !dbg !63 + %2303 = fsub float %2270, %2294, !dbg !63 + %2304 = fsub float %2271, %2294, !dbg !63 + %2305 = fsub float %2272, %2294, !dbg !63 + %2306 = fsub float %2273, %2294, !dbg !63 + %2307 = fsub float %2274, %2294, !dbg !63 + %2308 = fsub float %2275, %2294, !dbg !63 + %2309 = fsub float %2276, %2294, !dbg !63 + %2310 = fsub float %2277, %2294, !dbg !63 + %2311 = fsub float %2278, %2294, !dbg !63 + %2312 = fsub float %2279, %2294, !dbg !63 + %2313 = fsub float %2280, %2294, !dbg !63 + %2314 = fsub float %2281, %2294, !dbg !63 + %2315 = fsub float %2282, %2294, !dbg !63 + %2316 = fsub float %2283, %2294, !dbg !63 + %2317 = fsub float %2284, %2294, !dbg !63 + %2318 = fsub float %2285, %2294, !dbg !63 + %2319 = fsub float %2286, %2294, !dbg !63 + %2320 = fsub float %2287, %2294, !dbg !63 + %2321 = fsub float %2288, %2294, !dbg !63 + %2322 = fsub float %2289, %2294, !dbg !63 + %2323 = fsub float %2290, %2294, !dbg !63 + %2324 = fsub float %2291, %2294, !dbg !63 + %2325 = fsub float %2292, %2294, !dbg !63 + %2326 = fsub float %2293, %2294, !dbg !63 + %2327 = tail call float @llvm.amdgcn.exp2.f32(float %2295), !dbg !64 + %2328 = tail call float @llvm.amdgcn.exp2.f32(float %2296), !dbg !64 + %2329 = tail call float @llvm.amdgcn.exp2.f32(float %2297), !dbg !64 + %2330 = tail call float @llvm.amdgcn.exp2.f32(float %2298), !dbg !64 + %2331 = tail call float @llvm.amdgcn.exp2.f32(float %2299), !dbg !64 + %2332 = tail call float @llvm.amdgcn.exp2.f32(float %2300), !dbg !64 + %2333 = tail call float @llvm.amdgcn.exp2.f32(float %2301), !dbg !64 + %2334 = tail call float @llvm.amdgcn.exp2.f32(float %2302), !dbg !64 + %2335 = tail call float @llvm.amdgcn.exp2.f32(float %2303), !dbg !64 + %2336 = tail call float @llvm.amdgcn.exp2.f32(float %2304), !dbg !64 + %2337 = tail call float @llvm.amdgcn.exp2.f32(float %2305), !dbg !64 + %2338 = tail call float @llvm.amdgcn.exp2.f32(float %2306), !dbg !64 + %2339 = tail call float @llvm.amdgcn.exp2.f32(float %2307), !dbg !64 + %2340 = tail call float @llvm.amdgcn.exp2.f32(float %2308), !dbg !64 + %2341 = tail call float @llvm.amdgcn.exp2.f32(float %2309), !dbg !64 + %2342 = tail call float @llvm.amdgcn.exp2.f32(float %2310), !dbg !64 + %2343 = tail call float @llvm.amdgcn.exp2.f32(float %2311), !dbg !64 + %2344 = tail call float @llvm.amdgcn.exp2.f32(float %2312), !dbg !64 + %2345 = tail call float @llvm.amdgcn.exp2.f32(float %2313), !dbg !64 + %2346 = tail call float @llvm.amdgcn.exp2.f32(float %2314), !dbg !64 + %2347 = tail call float @llvm.amdgcn.exp2.f32(float %2315), !dbg !64 + %2348 = tail call float @llvm.amdgcn.exp2.f32(float %2316), !dbg !64 + %2349 = tail call float @llvm.amdgcn.exp2.f32(float %2317), !dbg !64 + %2350 = tail call float @llvm.amdgcn.exp2.f32(float %2318), !dbg !64 + %2351 = tail call float @llvm.amdgcn.exp2.f32(float %2319), !dbg !64 + %2352 = tail call float @llvm.amdgcn.exp2.f32(float %2320), !dbg !64 + %2353 = tail call float @llvm.amdgcn.exp2.f32(float %2321), !dbg !64 + %2354 = tail call float @llvm.amdgcn.exp2.f32(float %2322), !dbg !64 + %2355 = tail call float @llvm.amdgcn.exp2.f32(float %2323), !dbg !64 + %2356 = tail call float @llvm.amdgcn.exp2.f32(float %2324), !dbg !64 + %2357 = tail call float @llvm.amdgcn.exp2.f32(float %2325), !dbg !64 + %2358 = tail call float @llvm.amdgcn.exp2.f32(float %2326), !dbg !64 + %shift = shufflevector <2 x float> %2097, <2 x float> poison, <2 x i32> , !dbg !65 + %2359 = fsub <2 x float> %2097, %shift, !dbg !65 + %2360 = extractelement <2 x float> %2359, i64 0, !dbg !65 + %2361 = tail call float @llvm.amdgcn.exp2.f32(float %2360), !dbg !66 + %2362 = shufflevector <2 x half> %2208, <2 x half> %2211, <8 x i32> , !dbg !81 + %2363 = shufflevector <8 x half> %2362, <8 x half> %2215, <8 x i32> , !dbg !81 + %2364 = shufflevector <8 x half> %2363, <8 x half> %2219, <8 x i32> , !dbg !81 + %2365 = shufflevector <2 x half> %2222, <2 x half> %2225, <8 x i32> , !dbg !81 + %2366 = shufflevector <8 x half> %2365, <8 x half> %2229, <8 x i32> , !dbg !81 + %2367 = shufflevector <8 x half> %2366, <8 x half> %2233, <8 x i32> , !dbg !81 + %2368 = shufflevector <2 x half> %2236, <2 x half> %2239, <8 x i32> , !dbg !81 + %2369 = shufflevector <8 x half> %2368, <8 x half> %2243, <8 x i32> , !dbg !81 + %2370 = shufflevector <8 x half> %2369, <8 x half> %2247, <8 x i32> , !dbg !81 + %2371 = shufflevector <2 x half> %2250, <2 x half> %2253, <8 x i32> , !dbg !81 + %2372 = shufflevector <8 x half> %2371, <8 x half> %2257, <8 x i32> , !dbg !81 + %2373 = shufflevector <8 x half> %2372, <8 x half> %2261, <8 x i32> , !dbg !81 + %2374 = shufflevector <4 x half> %1993, <4 x half> %1995, <8 x i32> , !dbg !81 + %2375 = shufflevector <4 x half> %1997, <4 x half> %1999, <8 x i32> , !dbg !81 + %2376 = shufflevector <4 x half> %2001, <4 x half> %2003, <8 x i32> , !dbg !81 + %2377 = shufflevector <4 x half> %2005, <4 x half> %2007, <8 x i32> , !dbg !81 + %2378 = shufflevector <4 x half> %2009, <4 x half> %2011, <8 x i32> , !dbg !81 + %2379 = shufflevector <4 x half> %2013, <4 x half> %2015, <8 x i32> , !dbg !81 + %2380 = shufflevector <4 x half> %2017, <4 x half> %2019, <8 x i32> , !dbg !81 + %2381 = shufflevector <4 x half> %2021, <4 x half> %2023, <8 x i32> , !dbg !81 + %2382 = shufflevector <4 x half> %2025, <4 x half> %2027, <8 x i32> , !dbg !81 + %2383 = shufflevector <4 x half> %2029, <4 x half> %2031, <8 x i32> , !dbg !81 + %2384 = shufflevector <4 x half> %2033, <4 x half> %2035, <8 x i32> , !dbg !81 + %2385 = shufflevector <4 x half> %2037, <4 x half> %2039, <8 x i32> , !dbg !81 + %2386 = shufflevector <4 x half> %2041, <4 x half> %2043, <8 x i32> , !dbg !81 + %2387 = shufflevector <4 x half> %2045, <4 x half> %2047, <8 x i32> , !dbg !81 + %2388 = shufflevector <4 x half> %2049, <4 x half> %2051, <8 x i32> , !dbg !81 + %2389 = shufflevector <4 x half> %2053, <4 x half> %2055, <8 x i32> , !dbg !81 + %2390 = shufflevector <2 x float> %2203, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !81 + %2391 = fmul <16 x float> %1872, %2390, !dbg !81 + %2392 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2374, <8 x half> %2364, <16 x float> %2391, i32 0, i32 0, i32 0), !dbg !81 + %2393 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2375, <8 x half> %2367, <16 x float> %2392, i32 0, i32 0, i32 0), !dbg !81 + %2394 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2376, <8 x half> %2370, <16 x float> %2393, i32 0, i32 0, i32 0), !dbg !81 + %2395 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2377, <8 x half> %2373, <16 x float> %2394, i32 0, i32 0, i32 0), !dbg !81 + %2396 = shufflevector <2 x float> %2203, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !81 + %2397 = fmul <16 x float> %2396, %1878, !dbg !81 + %2398 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2378, <8 x half> %2364, <16 x float> %2397, i32 0, i32 0, i32 0), !dbg !81 + %2399 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2379, <8 x half> %2367, <16 x float> %2398, i32 0, i32 0, i32 0), !dbg !81 + %2400 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2380, <8 x half> %2370, <16 x float> %2399, i32 0, i32 0, i32 0), !dbg !81 + %2401 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2381, <8 x half> %2373, <16 x float> %2400, i32 0, i32 0, i32 0), !dbg !81 + %2402 = shufflevector <2 x float> %2203, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !81 + %2403 = fmul <16 x float> %2402, %1884, !dbg !81 + %2404 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2382, <8 x half> %2364, <16 x float> %2403, i32 0, i32 0, i32 0), !dbg !81 + %2405 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2383, <8 x half> %2367, <16 x float> %2404, i32 0, i32 0, i32 0), !dbg !81 + %2406 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2384, <8 x half> %2370, <16 x float> %2405, i32 0, i32 0, i32 0), !dbg !81 + %2407 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2385, <8 x half> %2373, <16 x float> %2406, i32 0, i32 0, i32 0), !dbg !81 + %2408 = shufflevector <2 x float> %2203, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !81 + %2409 = fmul <16 x float> %2408, %1890, !dbg !81 + %2410 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2386, <8 x half> %2364, <16 x float> %2409, i32 0, i32 0, i32 0), !dbg !81 + %2411 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2387, <8 x half> %2367, <16 x float> %2410, i32 0, i32 0, i32 0), !dbg !81 + %2412 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2388, <8 x half> %2370, <16 x float> %2411, i32 0, i32 0, i32 0), !dbg !81 + %2413 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2389, <8 x half> %2373, <16 x float> %2412, i32 0, i32 0, i32 0), !dbg !81 + %2414 = fadd float %2327, %2328, !dbg !72 + %2415 = fadd float %2329, %2414, !dbg !72 + %2416 = fadd float %2330, %2415, !dbg !72 + %2417 = fadd float %2331, %2416, !dbg !72 + %2418 = fadd float %2332, %2417, !dbg !72 + %2419 = fadd float %2333, %2418, !dbg !72 + %2420 = fadd float %2334, %2419, !dbg !72 + %2421 = fadd float %2335, %2420, !dbg !72 + %2422 = fadd float %2336, %2421, !dbg !72 + %2423 = fadd float %2337, %2422, !dbg !72 + %2424 = fadd float %2338, %2423, !dbg !72 + %2425 = fadd float %2339, %2424, !dbg !72 + %2426 = fadd float %2340, %2425, !dbg !72 + %2427 = fadd float %2341, %2426, !dbg !72 + %2428 = fadd float %2342, %2427, !dbg !72 + %2429 = fadd float %2343, %2428, !dbg !72 + %2430 = fadd float %2344, %2429, !dbg !72 + %2431 = fadd float %2345, %2430, !dbg !72 + %2432 = fadd float %2346, %2431, !dbg !72 + %2433 = fadd float %2347, %2432, !dbg !72 + %2434 = fadd float %2348, %2433, !dbg !72 + %2435 = fadd float %2349, %2434, !dbg !72 + %2436 = fadd float %2350, %2435, !dbg !72 + %2437 = fadd float %2351, %2436, !dbg !72 + %2438 = fadd float %2352, %2437, !dbg !72 + %2439 = fadd float %2353, %2438, !dbg !72 + %2440 = fadd float %2354, %2439, !dbg !72 + %2441 = fadd float %2355, %2440, !dbg !72 + %2442 = fadd float %2356, %2441, !dbg !72 + %2443 = fadd float %2357, %2442, !dbg !72 + %2444 = fadd float %2358, %2443, !dbg !72 + %2445 = bitcast float %2444 to i32, !dbg !73 + %2446 = tail call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %2445, i32 %2445, i1 false, i1 false), !dbg !73 + %2447 = extractvalue { i32, i32 } %2446, 0, !dbg !73 + %2448 = extractvalue { i32, i32 } %2446, 1, !dbg !73 + %2449 = bitcast i32 %2447 to float, !dbg !73 + %2450 = bitcast i32 %2448 to float, !dbg !73 + %2451 = fadd float %2449, %2450, !dbg !72 + %2452 = insertelement <2 x float> poison, float %2361, i64 0, !dbg !74 + %2453 = fmul float %2205, %2361, !dbg !75 + %2454 = fadd float %2453, %2451, !dbg !76 + %2455 = insertelement <2 x float> poison, float %2327, i64 0, !dbg !77 + %2456 = insertelement <2 x float> %2455, float %2328, i64 1, !dbg !77 + %2457 = fptrunc <2 x float> %2456 to <2 x half>, !dbg !77 + %2458 = insertelement <2 x float> poison, float %2329, i64 0, !dbg !77 + %2459 = insertelement <2 x float> %2458, float %2330, i64 1, !dbg !77 + %2460 = fptrunc <2 x float> %2459 to <2 x half>, !dbg !77 + %2461 = insertelement <2 x float> poison, float %2331, i64 0, !dbg !77 + %2462 = insertelement <2 x float> %2461, float %2332, i64 1, !dbg !77 + %2463 = fptrunc <2 x float> %2462 to <2 x half>, !dbg !77 + %2464 = shufflevector <2 x half> %2463, <2 x half> poison, <8 x i32> + %2465 = insertelement <2 x float> poison, float %2333, i64 0, !dbg !77 + %2466 = insertelement <2 x float> %2465, float %2334, i64 1, !dbg !77 + %2467 = fptrunc <2 x float> %2466 to <2 x half>, !dbg !77 + %2468 = shufflevector <2 x half> %2467, <2 x half> poison, <8 x i32> + %2469 = insertelement <2 x float> poison, float %2335, i64 0, !dbg !77 + %2470 = insertelement <2 x float> %2469, float %2336, i64 1, !dbg !77 + %2471 = fptrunc <2 x float> %2470 to <2 x half>, !dbg !77 + %2472 = insertelement <2 x float> poison, float %2337, i64 0, !dbg !77 + %2473 = insertelement <2 x float> %2472, float %2338, i64 1, !dbg !77 + %2474 = fptrunc <2 x float> %2473 to <2 x half>, !dbg !77 + %2475 = insertelement <2 x float> poison, float %2339, i64 0, !dbg !77 + %2476 = insertelement <2 x float> %2475, float %2340, i64 1, !dbg !77 + %2477 = fptrunc <2 x float> %2476 to <2 x half>, !dbg !77 + %2478 = shufflevector <2 x half> %2477, <2 x half> poison, <8 x i32> + %2479 = insertelement <2 x float> poison, float %2341, i64 0, !dbg !77 + %2480 = insertelement <2 x float> %2479, float %2342, i64 1, !dbg !77 + %2481 = fptrunc <2 x float> %2480 to <2 x half>, !dbg !77 + %2482 = shufflevector <2 x half> %2481, <2 x half> poison, <8 x i32> + %2483 = insertelement <2 x float> poison, float %2343, i64 0, !dbg !77 + %2484 = insertelement <2 x float> %2483, float %2344, i64 1, !dbg !77 + %2485 = fptrunc <2 x float> %2484 to <2 x half>, !dbg !77 + %2486 = insertelement <2 x float> poison, float %2345, i64 0, !dbg !77 + %2487 = insertelement <2 x float> %2486, float %2346, i64 1, !dbg !77 + %2488 = fptrunc <2 x float> %2487 to <2 x half>, !dbg !77 + %2489 = insertelement <2 x float> poison, float %2347, i64 0, !dbg !77 + %2490 = insertelement <2 x float> %2489, float %2348, i64 1, !dbg !77 + %2491 = fptrunc <2 x float> %2490 to <2 x half>, !dbg !77 + %2492 = shufflevector <2 x half> %2491, <2 x half> poison, <8 x i32> + %2493 = insertelement <2 x float> poison, float %2349, i64 0, !dbg !77 + %2494 = insertelement <2 x float> %2493, float %2350, i64 1, !dbg !77 + %2495 = fptrunc <2 x float> %2494 to <2 x half>, !dbg !77 + %2496 = shufflevector <2 x half> %2495, <2 x half> poison, <8 x i32> + %2497 = insertelement <2 x float> poison, float %2351, i64 0, !dbg !77 + %2498 = insertelement <2 x float> %2497, float %2352, i64 1, !dbg !77 + %2499 = fptrunc <2 x float> %2498 to <2 x half>, !dbg !77 + %2500 = insertelement <2 x float> poison, float %2353, i64 0, !dbg !77 + %2501 = insertelement <2 x float> %2500, float %2354, i64 1, !dbg !77 + %2502 = fptrunc <2 x float> %2501 to <2 x half>, !dbg !77 + %2503 = insertelement <2 x float> poison, float %2355, i64 0, !dbg !77 + %2504 = insertelement <2 x float> %2503, float %2356, i64 1, !dbg !77 + %2505 = fptrunc <2 x float> %2504 to <2 x half>, !dbg !77 + %2506 = shufflevector <2 x half> %2505, <2 x half> poison, <8 x i32> + %2507 = insertelement <2 x float> poison, float %2357, i64 0, !dbg !77 + %2508 = insertelement <2 x float> %2507, float %2358, i64 1, !dbg !77 + %2509 = fptrunc <2 x float> %2508 to <2 x half>, !dbg !77 + %2510 = shufflevector <2 x half> %2509, <2 x half> poison, <8 x i32> + tail call void @llvm.amdgcn.s.waitcnt(i32 -49168), !dbg !47 + fence syncscope("workgroup") release, !dbg !47 + tail call void @llvm.amdgcn.s.barrier(), !dbg !47 + fence syncscope("workgroup") acquire, !dbg !47 + %2511 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1661, !dbg !47 + %2512 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %2511), !dbg !47, !alias.scope !52, !noalias !48 + %2513 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1664, !dbg !47 + %2514 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2513), !dbg !47, !alias.scope !52, !noalias !48 + %2515 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1667, !dbg !47 + %2516 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2515), !dbg !47, !alias.scope !52, !noalias !48 + %2517 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1670, !dbg !47 + %2518 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2517), !dbg !47, !alias.scope !52, !noalias !48 + %2519 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1673, !dbg !47 + %2520 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2519), !dbg !47, !alias.scope !52, !noalias !48 + %2521 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1676, !dbg !47 + %2522 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2521), !dbg !47, !alias.scope !52, !noalias !48 + %2523 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1679, !dbg !47 + %2524 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2523), !dbg !47, !alias.scope !52, !noalias !48 + %2525 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1682, !dbg !47 + %2526 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2525), !dbg !47, !alias.scope !52, !noalias !48 + %2527 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1689, !dbg !47 + %2528 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %2527), !dbg !47, !alias.scope !52, !noalias !48 + %2529 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1692, !dbg !47 + %2530 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2529), !dbg !47, !alias.scope !52, !noalias !48 + %2531 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1695, !dbg !47 + %2532 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2531), !dbg !47, !alias.scope !52, !noalias !48 + %2533 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1698, !dbg !47 + %2534 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2533), !dbg !47, !alias.scope !52, !noalias !48 + %2535 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1701, !dbg !47 + %2536 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2535), !dbg !47, !alias.scope !52, !noalias !48 + %2537 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1704, !dbg !47 + %2538 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2537), !dbg !47, !alias.scope !52, !noalias !48 + %2539 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1707, !dbg !47 + %2540 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2539), !dbg !47, !alias.scope !52, !noalias !48 + %2541 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1710, !dbg !47 + %2542 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2541), !dbg !47, !alias.scope !52, !noalias !48 + %2543 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1716, !dbg !47 + %2544 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %2543), !dbg !47, !alias.scope !52, !noalias !48 + %2545 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1719, !dbg !47 + %2546 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2545), !dbg !47, !alias.scope !52, !noalias !48 + %2547 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1722, !dbg !47 + %2548 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2547), !dbg !47, !alias.scope !52, !noalias !48 + %2549 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1725, !dbg !47 + %2550 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2549), !dbg !47, !alias.scope !52, !noalias !48 + %2551 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1728, !dbg !47 + %2552 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2551), !dbg !47, !alias.scope !52, !noalias !48 + %2553 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1731, !dbg !47 + %2554 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2553), !dbg !47, !alias.scope !52, !noalias !48 + %2555 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1734, !dbg !47 + %2556 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2555), !dbg !47, !alias.scope !52, !noalias !48 + %2557 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1737, !dbg !47 + %2558 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2557), !dbg !47, !alias.scope !52, !noalias !48 + %2559 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1743, !dbg !47 + %2560 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %2559), !dbg !47, !alias.scope !52, !noalias !48 + %2561 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1746, !dbg !47 + %2562 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2561), !dbg !47, !alias.scope !52, !noalias !48 + %2563 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1749, !dbg !47 + %2564 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2563), !dbg !47, !alias.scope !52, !noalias !48 + %2565 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1752, !dbg !47 + %2566 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2565), !dbg !47, !alias.scope !52, !noalias !48 + %2567 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1755, !dbg !47 + %2568 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2567), !dbg !47, !alias.scope !52, !noalias !48 + %2569 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1758, !dbg !47 + %2570 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2569), !dbg !47, !alias.scope !52, !noalias !48 + %2571 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1761, !dbg !47 + %2572 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2571), !dbg !47, !alias.scope !52, !noalias !48 + %2573 = getelementptr inbounds nuw half, ptr addrspace(3) %1931, i32 %1764, !dbg !47 + %2574 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) nonnull %2573), !dbg !47, !alias.scope !52, !noalias !48 + %2575 = shufflevector <2 x half> %2457, <2 x half> %2460, <8 x i32> , !dbg !81 + %2576 = shufflevector <8 x half> %2575, <8 x half> %2464, <8 x i32> , !dbg !81 + %2577 = shufflevector <8 x half> %2576, <8 x half> %2468, <8 x i32> , !dbg !81 + %2578 = shufflevector <2 x half> %2471, <2 x half> %2474, <8 x i32> , !dbg !81 + %2579 = shufflevector <8 x half> %2578, <8 x half> %2478, <8 x i32> , !dbg !81 + %2580 = shufflevector <8 x half> %2579, <8 x half> %2482, <8 x i32> , !dbg !81 + %2581 = shufflevector <2 x half> %2485, <2 x half> %2488, <8 x i32> , !dbg !81 + %2582 = shufflevector <8 x half> %2581, <8 x half> %2492, <8 x i32> , !dbg !81 + %2583 = shufflevector <8 x half> %2582, <8 x half> %2496, <8 x i32> , !dbg !81 + %2584 = shufflevector <2 x half> %2499, <2 x half> %2502, <8 x i32> , !dbg !81 + %2585 = shufflevector <8 x half> %2584, <8 x half> %2506, <8 x i32> , !dbg !81 + %2586 = shufflevector <8 x half> %2585, <8 x half> %2510, <8 x i32> , !dbg !81 + %2587 = shufflevector <4 x half> %2512, <4 x half> %2514, <8 x i32> , !dbg !81 + %2588 = shufflevector <4 x half> %2516, <4 x half> %2518, <8 x i32> , !dbg !81 + %2589 = shufflevector <4 x half> %2520, <4 x half> %2522, <8 x i32> , !dbg !81 + %2590 = shufflevector <4 x half> %2524, <4 x half> %2526, <8 x i32> , !dbg !81 + %2591 = shufflevector <4 x half> %2528, <4 x half> %2530, <8 x i32> , !dbg !81 + %2592 = shufflevector <4 x half> %2532, <4 x half> %2534, <8 x i32> , !dbg !81 + %2593 = shufflevector <4 x half> %2536, <4 x half> %2538, <8 x i32> , !dbg !81 + %2594 = shufflevector <4 x half> %2540, <4 x half> %2542, <8 x i32> , !dbg !81 + %2595 = shufflevector <4 x half> %2544, <4 x half> %2546, <8 x i32> , !dbg !81 + %2596 = shufflevector <4 x half> %2548, <4 x half> %2550, <8 x i32> , !dbg !81 + %2597 = shufflevector <4 x half> %2552, <4 x half> %2554, <8 x i32> , !dbg !81 + %2598 = shufflevector <4 x half> %2556, <4 x half> %2558, <8 x i32> , !dbg !81 + %2599 = shufflevector <4 x half> %2560, <4 x half> %2562, <8 x i32> , !dbg !81 + %2600 = shufflevector <4 x half> %2564, <4 x half> %2566, <8 x i32> , !dbg !81 + %2601 = shufflevector <4 x half> %2568, <4 x half> %2570, <8 x i32> , !dbg !81 + %2602 = shufflevector <4 x half> %2572, <4 x half> %2574, <8 x i32> , !dbg !81 + %2603 = shufflevector <2 x float> %2452, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !81 + %2604 = fmul <16 x float> %2395, %2603, !dbg !81 + %2605 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2587, <8 x half> %2577, <16 x float> %2604, i32 0, i32 0, i32 0), !dbg !81 + %2606 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2588, <8 x half> %2580, <16 x float> %2605, i32 0, i32 0, i32 0), !dbg !81 + %2607 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2589, <8 x half> %2583, <16 x float> %2606, i32 0, i32 0, i32 0), !dbg !81 + %2608 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2590, <8 x half> %2586, <16 x float> %2607, i32 0, i32 0, i32 0), !dbg !81 + %2609 = shufflevector <2 x float> %2452, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !81 + %2610 = fmul <16 x float> %2609, %2401, !dbg !81 + %2611 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2591, <8 x half> %2577, <16 x float> %2610, i32 0, i32 0, i32 0), !dbg !81 + %2612 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2592, <8 x half> %2580, <16 x float> %2611, i32 0, i32 0, i32 0), !dbg !81 + %2613 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2593, <8 x half> %2583, <16 x float> %2612, i32 0, i32 0, i32 0), !dbg !81 + %2614 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2594, <8 x half> %2586, <16 x float> %2613, i32 0, i32 0, i32 0), !dbg !81 + %2615 = shufflevector <2 x float> %2452, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !81 + %2616 = fmul <16 x float> %2615, %2407, !dbg !81 + %2617 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2595, <8 x half> %2577, <16 x float> %2616, i32 0, i32 0, i32 0), !dbg !81 + %2618 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2596, <8 x half> %2580, <16 x float> %2617, i32 0, i32 0, i32 0), !dbg !81 + %2619 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2597, <8 x half> %2583, <16 x float> %2618, i32 0, i32 0, i32 0), !dbg !81 + %2620 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2598, <8 x half> %2586, <16 x float> %2619, i32 0, i32 0, i32 0), !dbg !81 + %2621 = shufflevector <2 x float> %2452, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !81 + %2622 = fmul <16 x float> %2621, %2413, !dbg !81 + %2623 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2599, <8 x half> %2577, <16 x float> %2622, i32 0, i32 0, i32 0), !dbg !81 + %2624 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2600, <8 x half> %2580, <16 x float> %2623, i32 0, i32 0, i32 0), !dbg !81 + %2625 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2601, <8 x half> %2583, <16 x float> %2624, i32 0, i32 0, i32 0), !dbg !81 + %2626 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %2602, <8 x half> %2586, <16 x float> %2625, i32 0, i32 0, i32 0), !dbg !81 + fence syncscope("workgroup") release, !dbg !88 + tail call void @llvm.amdgcn.s.barrier(), !dbg !88 + fence syncscope("workgroup") acquire, !dbg !88 + br i1 %135, label %2640, label %2627, !dbg !89 + +2627: ; preds = %1512 + %2628 = sub i32 16384, %32, !dbg !90 + %2629 = icmp slt i32 %54, %2628, !dbg !91 + %2630 = tail call noundef float @llvm.log2.f32(float %2454), !dbg !92 + %2631 = fadd float %2094, %2630, !dbg !93 + %2632 = getelementptr inbounds nuw float, ptr addrspace(3) @global_smem, i32 %44, !dbg !94 + %2633 = insertelement <1 x float> poison, float %2631, i64 0, !dbg !94 + store <1 x float> %2633, ptr addrspace(3) %2632, align 4, !dbg !94 + fence syncscope("workgroup") release, !dbg !94 + tail call void @llvm.amdgcn.s.barrier(), !dbg !94 + fence syncscope("workgroup") acquire, !dbg !94 + %2634 = getelementptr inbounds nuw float, ptr addrspace(3) @global_smem, i32 %54, !dbg !94 + %2635 = load float, ptr addrspace(3) %2634, align 4, !dbg !94 + %2636 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %133, i16 0, i32 2147483646, i32 159744), !dbg !94 + %2637 = and i1 %40, %2629, !dbg !94 + %2638 = shl nuw nsw i32 %54, 2, !dbg !94 + %2639 = select i1 %2637, i32 %2638, i32 -2147483648, !dbg !94 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %2635, ptr addrspace(8) %2636, i32 %2639, i32 0, i32 0), !dbg !94 + br label %.critedge, !dbg !89 + +2640: ; preds = %1512 + %2641 = tail call noundef float @llvm.log2.f32(float %2454), !dbg !95 + %2642 = fadd float %2094, %2641, !dbg !96 + %2643 = getelementptr inbounds nuw float, ptr addrspace(3) @global_smem, i32 %44, !dbg !97 + %2644 = insertelement <1 x float> poison, float %2642, i64 0, !dbg !97 + store <1 x float> %2644, ptr addrspace(3) %2643, align 4, !dbg !97 + fence syncscope("workgroup") release, !dbg !97 + tail call void @llvm.amdgcn.s.barrier(), !dbg !97 + fence syncscope("workgroup") acquire, !dbg !97 + %2645 = getelementptr inbounds nuw float, ptr addrspace(3) @global_smem, i32 %54, !dbg !97 + %2646 = load float, ptr addrspace(3) %2645, align 4, !dbg !97 + %2647 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %133, i16 0, i32 2147483646, i32 159744), !dbg !97 + %2648 = shl nuw nsw i32 %54, 2, !dbg !97 + %2649 = select i1 %40, i32 %2648, i32 -2147483648, !dbg !97 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %2646, ptr addrspace(8) %2647, i32 %2649, i32 0, i32 0), !dbg !97 + br label %.critedge, !dbg !89 + +.critedge: ; preds = %2627, %2640 + %2650 = phi i1 [ %115, %2627 ], [ true, %2640 ], !dbg !98 + %2651 = tail call { float, i1 } @llvm.amdgcn.div.scale.f32(float 1.000000e+00, float %2454, i1 true), !dbg !99 + %2652 = extractvalue { float, i1 } %2651, 0, !dbg !99 + %2653 = tail call { float, i1 } @llvm.amdgcn.div.scale.f32(float 1.000000e+00, float %2454, i1 false), !dbg !99 + %2654 = extractvalue { float, i1 } %2653, 0, !dbg !99 + %2655 = tail call float @llvm.amdgcn.rcp.f32(float %2654), !dbg !99 + %2656 = fmul float %2652, %2655, !dbg !99 + %2657 = extractvalue { float, i1 } %2651, 1, !dbg !99 + %2658 = tail call float @llvm.amdgcn.div.fmas.f32(float 0.000000e+00, float 0.000000e+00, float %2656, i1 %2657), !dbg !99 + %2659 = tail call float @llvm.amdgcn.div.fixup.f32(float %2658, float %2454, float 1.000000e+00), !dbg !99 + %2660 = insertelement <2 x float> poison, float %2659, i64 0, !dbg !100 + %2661 = shufflevector <2 x float> %2660, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !100 + %2662 = shufflevector <16 x float> %2626, <16 x float> poison, <2 x i32> , !dbg !100 + %2663 = fmul <2 x float> %2661, %2662, !dbg !100 + %2664 = fptrunc <2 x float> %2663 to <2 x half>, !dbg !101 + %2665 = shufflevector <16 x float> %2626, <16 x float> poison, <2 x i32> , !dbg !100 + %2666 = fmul <2 x float> %2661, %2665, !dbg !100 + %2667 = fptrunc <2 x float> %2666 to <2 x half>, !dbg !101 + %2668 = shufflevector <16 x float> %2626, <16 x float> poison, <2 x i32> , !dbg !100 + %2669 = fmul <2 x float> %2661, %2668, !dbg !100 + %2670 = fptrunc <2 x float> %2669 to <2 x half>, !dbg !101 + %2671 = shufflevector <16 x float> %2626, <16 x float> poison, <2 x i32> , !dbg !100 + %2672 = fmul <2 x float> %2661, %2671, !dbg !100 + %2673 = fptrunc <2 x float> %2672 to <2 x half>, !dbg !101 + %2674 = shufflevector <16 x float> %2626, <16 x float> poison, <2 x i32> , !dbg !100 + %2675 = fmul <2 x float> %2661, %2674, !dbg !100 + %2676 = fptrunc <2 x float> %2675 to <2 x half>, !dbg !101 + %2677 = shufflevector <16 x float> %2626, <16 x float> poison, <2 x i32> , !dbg !100 + %2678 = fmul <2 x float> %2661, %2677, !dbg !100 + %2679 = fptrunc <2 x float> %2678 to <2 x half>, !dbg !101 + %2680 = shufflevector <16 x float> %2626, <16 x float> poison, <2 x i32> , !dbg !100 + %2681 = fmul <2 x float> %2661, %2680, !dbg !100 + %2682 = fptrunc <2 x float> %2681 to <2 x half>, !dbg !101 + %2683 = shufflevector <16 x float> %2626, <16 x float> poison, <2 x i32> , !dbg !100 + %2684 = fmul <2 x float> %2661, %2683, !dbg !100 + %2685 = fptrunc <2 x float> %2684 to <2 x half>, !dbg !101 + %2686 = shufflevector <16 x float> %2620, <16 x float> poison, <2 x i32> , !dbg !100 + %2687 = fmul <2 x float> %2661, %2686, !dbg !100 + %2688 = fptrunc <2 x float> %2687 to <2 x half>, !dbg !101 + %2689 = shufflevector <16 x float> %2620, <16 x float> poison, <2 x i32> , !dbg !100 + %2690 = fmul <2 x float> %2661, %2689, !dbg !100 + %2691 = fptrunc <2 x float> %2690 to <2 x half>, !dbg !101 + %2692 = shufflevector <16 x float> %2620, <16 x float> poison, <2 x i32> , !dbg !100 + %2693 = fmul <2 x float> %2661, %2692, !dbg !100 + %2694 = fptrunc <2 x float> %2693 to <2 x half>, !dbg !101 + %2695 = shufflevector <16 x float> %2620, <16 x float> poison, <2 x i32> , !dbg !100 + %2696 = fmul <2 x float> %2661, %2695, !dbg !100 + %2697 = fptrunc <2 x float> %2696 to <2 x half>, !dbg !101 + %2698 = shufflevector <16 x float> %2620, <16 x float> poison, <2 x i32> , !dbg !100 + %2699 = fmul <2 x float> %2661, %2698, !dbg !100 + %2700 = fptrunc <2 x float> %2699 to <2 x half>, !dbg !101 + %2701 = shufflevector <16 x float> %2620, <16 x float> poison, <2 x i32> , !dbg !100 + %2702 = fmul <2 x float> %2661, %2701, !dbg !100 + %2703 = fptrunc <2 x float> %2702 to <2 x half>, !dbg !101 + %2704 = shufflevector <16 x float> %2620, <16 x float> poison, <2 x i32> , !dbg !100 + %2705 = fmul <2 x float> %2661, %2704, !dbg !100 + %2706 = fptrunc <2 x float> %2705 to <2 x half>, !dbg !101 + %2707 = shufflevector <16 x float> %2620, <16 x float> poison, <2 x i32> , !dbg !100 + %2708 = fmul <2 x float> %2661, %2707, !dbg !100 + %2709 = fptrunc <2 x float> %2708 to <2 x half>, !dbg !101 + %2710 = shufflevector <16 x float> %2614, <16 x float> poison, <2 x i32> , !dbg !100 + %2711 = fmul <2 x float> %2661, %2710, !dbg !100 + %2712 = fptrunc <2 x float> %2711 to <2 x half>, !dbg !101 + %2713 = shufflevector <16 x float> %2614, <16 x float> poison, <2 x i32> , !dbg !100 + %2714 = fmul <2 x float> %2661, %2713, !dbg !100 + %2715 = fptrunc <2 x float> %2714 to <2 x half>, !dbg !101 + %2716 = shufflevector <16 x float> %2614, <16 x float> poison, <2 x i32> , !dbg !100 + %2717 = fmul <2 x float> %2661, %2716, !dbg !100 + %2718 = fptrunc <2 x float> %2717 to <2 x half>, !dbg !101 + %2719 = shufflevector <16 x float> %2614, <16 x float> poison, <2 x i32> , !dbg !100 + %2720 = fmul <2 x float> %2661, %2719, !dbg !100 + %2721 = fptrunc <2 x float> %2720 to <2 x half>, !dbg !101 + %2722 = shufflevector <16 x float> %2614, <16 x float> poison, <2 x i32> , !dbg !100 + %2723 = fmul <2 x float> %2661, %2722, !dbg !100 + %2724 = fptrunc <2 x float> %2723 to <2 x half>, !dbg !101 + %2725 = shufflevector <16 x float> %2614, <16 x float> poison, <2 x i32> , !dbg !100 + %2726 = fmul <2 x float> %2661, %2725, !dbg !100 + %2727 = fptrunc <2 x float> %2726 to <2 x half>, !dbg !101 + %2728 = shufflevector <16 x float> %2614, <16 x float> poison, <2 x i32> , !dbg !100 + %2729 = fmul <2 x float> %2661, %2728, !dbg !100 + %2730 = fptrunc <2 x float> %2729 to <2 x half>, !dbg !101 + %2731 = shufflevector <16 x float> %2614, <16 x float> poison, <2 x i32> , !dbg !100 + %2732 = fmul <2 x float> %2661, %2731, !dbg !100 + %2733 = fptrunc <2 x float> %2732 to <2 x half>, !dbg !101 + %2734 = shufflevector <16 x float> %2608, <16 x float> poison, <2 x i32> , !dbg !100 + %2735 = fmul <2 x float> %2661, %2734, !dbg !100 + %2736 = fptrunc <2 x float> %2735 to <2 x half>, !dbg !101 + %2737 = shufflevector <16 x float> %2608, <16 x float> poison, <2 x i32> , !dbg !100 + %2738 = fmul <2 x float> %2661, %2737, !dbg !100 + %2739 = fptrunc <2 x float> %2738 to <2 x half>, !dbg !101 + %2740 = shufflevector <16 x float> %2608, <16 x float> poison, <2 x i32> , !dbg !100 + %2741 = fmul <2 x float> %2661, %2740, !dbg !100 + %2742 = fptrunc <2 x float> %2741 to <2 x half>, !dbg !101 + %2743 = shufflevector <16 x float> %2608, <16 x float> poison, <2 x i32> , !dbg !100 + %2744 = fmul <2 x float> %2661, %2743, !dbg !100 + %2745 = fptrunc <2 x float> %2744 to <2 x half>, !dbg !101 + %2746 = shufflevector <16 x float> %2608, <16 x float> poison, <2 x i32> , !dbg !100 + %2747 = fmul <2 x float> %2661, %2746, !dbg !100 + %2748 = fptrunc <2 x float> %2747 to <2 x half>, !dbg !101 + %2749 = shufflevector <16 x float> %2608, <16 x float> poison, <2 x i32> , !dbg !100 + %2750 = fmul <2 x float> %2661, %2749, !dbg !100 + %2751 = fptrunc <2 x float> %2750 to <2 x half>, !dbg !101 + %2752 = shufflevector <16 x float> %2608, <16 x float> poison, <2 x i32> , !dbg !100 + %2753 = fmul <2 x float> %2661, %2752, !dbg !100 + %2754 = fptrunc <2 x float> %2753 to <2 x half>, !dbg !101 + %2755 = shufflevector <16 x float> %2608, <16 x float> poison, <2 x i32> , !dbg !100 + %2756 = fmul <2 x float> %2661, %2755, !dbg !100 + %2757 = fptrunc <2 x float> %2756 to <2 x half>, !dbg !101 + %2758 = or disjoint i32 %82, 120, !dbg !21 + %2759 = mul i32 %16, %44, !dbg !102 + %2760 = add i32 %2758, %2759, !dbg !103 + %2761 = or disjoint i32 %82, 112, !dbg !21 + %2762 = add i32 %2761, %2759, !dbg !103 + %2763 = or disjoint i32 %82, 104, !dbg !21 + %2764 = add i32 %2763, %2759, !dbg !103 + %2765 = or disjoint i32 %82, 96, !dbg !21 + %2766 = add i32 %2765, %2759, !dbg !103 + %2767 = or disjoint i32 %82, 88, !dbg !21 + %2768 = add i32 %2767, %2759, !dbg !103 + %2769 = or disjoint i32 %82, 80, !dbg !21 + %2770 = add i32 %2769, %2759, !dbg !103 + %2771 = or disjoint i32 %82, 72, !dbg !21 + %2772 = add i32 %2771, %2759, !dbg !103 + %2773 = or disjoint i32 %82, 64, !dbg !21 + %2774 = add i32 %2773, %2759, !dbg !103 + %2775 = or disjoint i32 %82, 56, !dbg !21 + %2776 = add i32 %2775, %2759, !dbg !103 + %2777 = or disjoint i32 %82, 48, !dbg !21 + %2778 = add i32 %2777, %2759, !dbg !103 + %2779 = or disjoint i32 %82, 40, !dbg !21 + %2780 = add i32 %2779, %2759, !dbg !103 + %2781 = or disjoint i32 %82, 32, !dbg !21 + %2782 = add i32 %2781, %2759, !dbg !103 + %2783 = or disjoint i32 %82, 24, !dbg !21 + %2784 = add i32 %2783, %2759, !dbg !103 + %2785 = or disjoint i32 %82, 16, !dbg !21 + %2786 = add i32 %2785, %2759, !dbg !103 + %2787 = or disjoint i32 %82, 8, !dbg !21 + %2788 = add i32 %2787, %2759, !dbg !103 + %2789 = add i32 %2759, %82, !dbg !103 + %2790 = mul i32 %14, %31, !dbg !104 + %2791 = sext i32 %2790 to i64, !dbg !105 + %2792 = getelementptr half, ptr addrspace(1) %4, i64 %2791, !dbg !105 + %2793 = mul i32 %15, %30, !dbg !106 + %2794 = sext i32 %2793 to i64, !dbg !107 + %2795 = getelementptr half, ptr addrspace(1) %2792, i64 %2794, !dbg !107 + %2796 = mul i32 %16, %32, !dbg !102 + %2797 = sext i32 %2796 to i64, !dbg !102 + %2798 = getelementptr half, ptr addrspace(1) %2795, i64 %2797, !dbg !102 + %2799 = trunc i32 %16 to i16, !dbg !108 + %2800 = and i16 %2799, 16383, !dbg !108 + %2801 = or disjoint i16 %2800, 16384, !dbg !108 + %2802 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %2798, i16 %2801, i32 2147483646, i32 159744), !dbg !108 + %2803 = shufflevector <2 x half> %2757, <2 x half> %2754, <4 x i32> , !dbg !108 + %2804 = bitcast <4 x half> %2803 to <2 x i32>, !dbg !108 + %2805 = shl i32 %2789, 1, !dbg !108 + %2806 = select i1 %2650, i32 %2805, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2804, ptr addrspace(8) %2802, i32 %2806, i32 0, i32 0), !dbg !108 + %2807 = shufflevector <2 x half> %2751, <2 x half> %2748, <4 x i32> , !dbg !108 + %2808 = bitcast <4 x half> %2807 to <2 x i32>, !dbg !108 + %2809 = shl i32 %2788, 1, !dbg !108 + %2810 = select i1 %2650, i32 %2809, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2808, ptr addrspace(8) %2802, i32 %2810, i32 0, i32 0), !dbg !108 + %2811 = shufflevector <2 x half> %2745, <2 x half> %2742, <4 x i32> , !dbg !108 + %2812 = bitcast <4 x half> %2811 to <2 x i32>, !dbg !108 + %2813 = shl i32 %2786, 1, !dbg !108 + %2814 = select i1 %2650, i32 %2813, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2812, ptr addrspace(8) %2802, i32 %2814, i32 0, i32 0), !dbg !108 + %2815 = shufflevector <2 x half> %2739, <2 x half> %2736, <4 x i32> , !dbg !108 + %2816 = bitcast <4 x half> %2815 to <2 x i32>, !dbg !108 + %2817 = shl i32 %2784, 1, !dbg !108 + %2818 = select i1 %2650, i32 %2817, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2816, ptr addrspace(8) %2802, i32 %2818, i32 0, i32 0), !dbg !108 + %2819 = shufflevector <2 x half> %2733, <2 x half> %2730, <4 x i32> , !dbg !108 + %2820 = bitcast <4 x half> %2819 to <2 x i32>, !dbg !108 + %2821 = shl i32 %2782, 1, !dbg !108 + %2822 = select i1 %2650, i32 %2821, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2820, ptr addrspace(8) %2802, i32 %2822, i32 0, i32 0), !dbg !108 + %2823 = shufflevector <2 x half> %2727, <2 x half> %2724, <4 x i32> , !dbg !108 + %2824 = bitcast <4 x half> %2823 to <2 x i32>, !dbg !108 + %2825 = shl i32 %2780, 1, !dbg !108 + %2826 = select i1 %2650, i32 %2825, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2824, ptr addrspace(8) %2802, i32 %2826, i32 0, i32 0), !dbg !108 + %2827 = shufflevector <2 x half> %2721, <2 x half> %2718, <4 x i32> , !dbg !108 + %2828 = bitcast <4 x half> %2827 to <2 x i32>, !dbg !108 + %2829 = shl i32 %2778, 1, !dbg !108 + %2830 = select i1 %2650, i32 %2829, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2828, ptr addrspace(8) %2802, i32 %2830, i32 0, i32 0), !dbg !108 + %2831 = shufflevector <2 x half> %2715, <2 x half> %2712, <4 x i32> , !dbg !108 + %2832 = bitcast <4 x half> %2831 to <2 x i32>, !dbg !108 + %2833 = shl i32 %2776, 1, !dbg !108 + %2834 = select i1 %2650, i32 %2833, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2832, ptr addrspace(8) %2802, i32 %2834, i32 0, i32 0), !dbg !108 + %2835 = shufflevector <2 x half> %2709, <2 x half> %2706, <4 x i32> , !dbg !108 + %2836 = bitcast <4 x half> %2835 to <2 x i32>, !dbg !108 + %2837 = shl i32 %2774, 1, !dbg !108 + %2838 = select i1 %2650, i32 %2837, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2836, ptr addrspace(8) %2802, i32 %2838, i32 0, i32 0), !dbg !108 + %2839 = shufflevector <2 x half> %2703, <2 x half> %2700, <4 x i32> , !dbg !108 + %2840 = bitcast <4 x half> %2839 to <2 x i32>, !dbg !108 + %2841 = shl i32 %2772, 1, !dbg !108 + %2842 = select i1 %2650, i32 %2841, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2840, ptr addrspace(8) %2802, i32 %2842, i32 0, i32 0), !dbg !108 + %2843 = shufflevector <2 x half> %2697, <2 x half> %2694, <4 x i32> , !dbg !108 + %2844 = bitcast <4 x half> %2843 to <2 x i32>, !dbg !108 + %2845 = shl i32 %2770, 1, !dbg !108 + %2846 = select i1 %2650, i32 %2845, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2844, ptr addrspace(8) %2802, i32 %2846, i32 0, i32 0), !dbg !108 + %2847 = shufflevector <2 x half> %2691, <2 x half> %2688, <4 x i32> , !dbg !108 + %2848 = bitcast <4 x half> %2847 to <2 x i32>, !dbg !108 + %2849 = shl i32 %2768, 1, !dbg !108 + %2850 = select i1 %2650, i32 %2849, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2848, ptr addrspace(8) %2802, i32 %2850, i32 0, i32 0), !dbg !108 + %2851 = shufflevector <2 x half> %2685, <2 x half> %2682, <4 x i32> , !dbg !108 + %2852 = bitcast <4 x half> %2851 to <2 x i32>, !dbg !108 + %2853 = shl i32 %2766, 1, !dbg !108 + %2854 = select i1 %2650, i32 %2853, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2852, ptr addrspace(8) %2802, i32 %2854, i32 0, i32 0), !dbg !108 + %2855 = shufflevector <2 x half> %2679, <2 x half> %2676, <4 x i32> , !dbg !108 + %2856 = bitcast <4 x half> %2855 to <2 x i32>, !dbg !108 + %2857 = shl i32 %2764, 1, !dbg !108 + %2858 = select i1 %2650, i32 %2857, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2856, ptr addrspace(8) %2802, i32 %2858, i32 0, i32 0), !dbg !108 + %2859 = shufflevector <2 x half> %2673, <2 x half> %2670, <4 x i32> , !dbg !108 + %2860 = bitcast <4 x half> %2859 to <2 x i32>, !dbg !108 + %2861 = shl i32 %2762, 1, !dbg !108 + %2862 = select i1 %2650, i32 %2861, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2860, ptr addrspace(8) %2802, i32 %2862, i32 0, i32 0), !dbg !108 + %2863 = shufflevector <2 x half> %2667, <2 x half> %2664, <4 x i32> , !dbg !108 + %2864 = bitcast <4 x half> %2863 to <2 x i32>, !dbg !108 + %2865 = shl i32 %2760, 1, !dbg !108 + %2866 = select i1 %2650, i32 %2865, i32 -2147483648, !dbg !108 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2864, ptr addrspace(8) %2802, i32 %2866, i32 0, i32 0), !dbg !108 + ret void, !dbg !109 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.amdgcn.workgroup.id.x() #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.amdgcn.workgroup.id.y() #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.amdgcn.workgroup.id.z() #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.amdgcn.workitem.id.x() #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) readnone, i16, i32, i32) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) readonly captures(none), i32, i32, i32 immarg) #2 + +; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn +declare void @llvm.amdgcn.s.barrier() #3 + +; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn memory(none) +declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #4 + +; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn memory(none) +declare i64 @llvm.amdgcn.ballot.i64(i1) #4 + +; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) readonly captures(none), ptr addrspace(3) writeonly captures(none), i32 immarg, i32, i32, i32 immarg, i32 immarg) #5 + +; Function Attrs: mustprogress nocallback nofree nounwind willreturn +declare void @llvm.amdgcn.s.waitcnt(i32 immarg) #6 + +; Function Attrs: convergent mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #7 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.maxnum.f32(float, float) #0 + +; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn memory(none) +declare { i32, i32 } @llvm.amdgcn.permlane32.swap(i32, i32, i1 immarg, i1 immarg) #4 + +; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn memory(read) +declare <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) captures(none)) #8 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1 immarg) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.amdgcn.rcp.f32(float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.amdgcn.div.fixup.f32(float, float, float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8) writeonly captures(none), i32, i32, i32 immarg) #9 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32>, ptr addrspace(8) writeonly captures(none), i32, i32, i32 immarg) #9 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn +declare void @llvm.amdgcn.s.setprio(i16 immarg) #10 + +; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn +declare void @llvm.amdgcn.sched.barrier(i32 immarg) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.log2.f32(float) #0 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nofree norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2" "denormal-fp-math-f32"="ieee" "uniform-work-group-size"="false" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #3 = { convergent mustprogress nocallback nofree nounwind willreturn } +attributes #4 = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) } +attributes #5 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #6 = { mustprogress nocallback nofree nounwind willreturn } +attributes #7 = { convergent mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #8 = { convergent mustprogress nocallback nofree nounwind willreturn memory(read) } +attributes #9 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) } +attributes #10 = { mustprogress nocallback nofree nosync nounwind willreturn } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "flash-attention.py", directory: "/var/lib/jenkins/OAI-triton/python/../fa") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 0} +!6 = !DISubprogram(name: "llvm.amdgcn.exp2.f32", linkageName: "llvm.amdgcn.exp2.f32", scope: !1, file: !1, line: 677, type: !7, scopeLine: 677, spFlags: DISPFlagOptimized) +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "attn_fwd", linkageName: "attn_fwd", scope: !1, file: !1, line: 435, type: !7, scopeLine: 435, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 488, column: 36, scope: !9) +!11 = !DILocation(line: 489, column: 36, scope: !9) +!12 = !DILocation(line: 490, column: 34, scope: !9) +!13 = !DILocation(line: 492, column: 27, scope: !9) +!14 = !DILocation(line: 492, column: 50, scope: !9) +!15 = !DILocation(line: 492, column: 37, scope: !9) +!16 = !DILocation(line: 571, column: 39, scope: !9) +!17 = !DILocation(line: 571, column: 31, scope: !9) +!18 = !DILocation(line: 571, column: 61, scope: !9) +!19 = !DILocation(line: 571, column: 51, scope: !9) +!20 = !DILocation(line: 572, column: 36, scope: !9) +!21 = !DILocation(line: 572, column: 73, scope: !9) +!22 = !DILocation(line: 572, column: 66, scope: !9) +!23 = !DILocation(line: 573, column: 39, scope: !9) +!24 = !DILocation(line: 573, column: 31, scope: !9) +!25 = !DILocation(line: 573, column: 61, scope: !9) +!26 = !DILocation(line: 573, column: 51, scope: !9) +!27 = !DILocation(line: 574, column: 66, scope: !9) +!28 = !DILocation(line: 575, column: 39, scope: !9) +!29 = !DILocation(line: 575, column: 31, scope: !9) +!30 = !DILocation(line: 575, column: 61, scope: !9) +!31 = !DILocation(line: 575, column: 51, scope: !9) +!32 = !DILocation(line: 576, column: 36, scope: !9) +!33 = !DILocation(line: 576, column: 66, scope: !9) +!34 = !DILocation(line: 622, column: 48, scope: !9) +!35 = !DILocation(line: 342, column: 28, scope: !36, inlinedAt: !37) +!36 = distinct !DILexicalBlockFile(scope: !9, file: !1, discriminator: 0) +!37 = !DILocation(line: 677, column: 52, scope: !9) +!38 = !DILocation(line: 343, column: 28, scope: !36, inlinedAt: !37) +!39 = !DILocation(line: 731, column: 42, scope: !9) +!40 = !DILocation(line: 731, column: 29, scope: !9) +!41 = !DILocation(line: 731, column: 68, scope: !9) +!42 = !DILocation(line: 731, column: 58, scope: !9) +!43 = !DILocation(line: 731, column: 84, scope: !9) +!44 = !DILocation(line: 734, column: 44, scope: !9) +!45 = !DILocation(line: 735, column: 35, scope: !9) +!46 = !DILocation(line: 625, column: 28, scope: !9) +!47 = !DILocation(line: 185, column: 25, scope: !36, inlinedAt: !37) +!48 = !{!49} +!49 = !{!"amdgpu.AsyncCopies", !50, !"Scope containing all AsyncCopyGlobalToLocal and BufferLoadToLocal ops"} +!50 = !{!"amdgpu.AsyncOps", !"Domain to hold alias scopes to specify aliasing information between AsyncCopyGlobalToLocal, BufferLoadToLocal and LocalLoad ops"} +!51 = !DILocation(line: 342, column: 18, scope: !36, inlinedAt: !37) +!52 = !{!53} +!53 = !{!"amdgpu.LocalLoads", !50, !"Scope containing all LocalLoad ops"} +!54 = !DILocation(line: 285, column: 28, scope: !36, inlinedAt: !37) +!55 = !DILocation(line: 343, column: 18, scope: !36, inlinedAt: !37) +!56 = !DILocation(line: 167, column: 27, scope: !57, inlinedAt: !37) +!57 = distinct !DILexicalBlockFile(scope: !9, file: !58, discriminator: 0) +!58 = !DIFile(filename: "standard.py", directory: "/var/lib/jenkins/OAI-triton/python/triton/language") +!59 = !DILocation(line: 188, column: 40, scope: !57, inlinedAt: !37) +!60 = !DILocation(line: 304, column: 31, scope: !36, inlinedAt: !37) +!61 = !DILocation(line: 305, column: 29, scope: !36, inlinedAt: !37) +!62 = !DILocation(line: 306, column: 18, scope: !36, inlinedAt: !37) +!63 = !DILocation(line: 306, column: 29, scope: !36, inlinedAt: !37) +!64 = !DILocation(line: 307, column: 25, scope: !36, inlinedAt: !37) +!65 = !DILocation(line: 320, column: 46, scope: !36, inlinedAt: !37) +!66 = !DILocation(line: 320, column: 29, scope: !36, inlinedAt: !37) +!67 = !DILocation(line: 188, column: 7, scope: !9) +!68 = !DILocation(line: 191, column: 14, scope: !9) +!69 = !DILocation(line: 194, column: 7, scope: !9) +!70 = !DILocation(line: 248, column: 47, scope: !36, inlinedAt: !37) +!71 = !DILocation(line: 203, column: 9, scope: !9) +!72 = !DILocation(line: 260, column: 15, scope: !57, inlinedAt: !37) +!73 = !DILocation(line: 290, column: 36, scope: !57, inlinedAt: !37) +!74 = !DILocation(line: 321, column: 20, scope: !36, inlinedAt: !37) +!75 = !DILocation(line: 325, column: 20, scope: !36, inlinedAt: !37) +!76 = !DILocation(line: 325, column: 28, scope: !36, inlinedAt: !37) +!77 = !DILocation(line: 340, column: 31, scope: !36, inlinedAt: !37) +!78 = !DILocation(line: 223, column: 9, scope: !9) +!79 = !DILocation(line: 225, column: 9, scope: !9) +!80 = !DILocation(line: 239, column: 9, scope: !9) +!81 = !DILocation(line: 340, column: 51, scope: !36, inlinedAt: !37) +!82 = !DILocation(line: 246, column: 9, scope: !9) +!83 = !DILocation(line: 320, column: 35, scope: !36, inlinedAt: !37) +!84 = !DILocation(line: 268, column: 9, scope: !9) +!85 = !DILocation(line: 270, column: 9, scope: !9) +!86 = !DILocation(line: 279, column: 9, scope: !9) +!87 = !DILocation(line: 283, column: 7, scope: !9) +!88 = !DILocation(line: 681, column: 16, scope: !9) +!89 = !DILocation(line: 735, column: 19, scope: !9) +!90 = !DILocation(line: 736, column: 62, scope: !9) +!91 = !DILocation(line: 737, column: 58, scope: !9) +!92 = !DILocation(line: 738, column: 56, scope: !9) +!93 = !DILocation(line: 738, column: 43, scope: !9) +!94 = !DILocation(line: 738, column: 37, scope: !9) +!95 = !DILocation(line: 740, column: 56, scope: !9) +!96 = !DILocation(line: 740, column: 43, scope: !9) +!97 = !DILocation(line: 740, column: 37, scope: !9) +!98 = !DILocation(line: 746, column: 19, scope: !9) +!99 = !DILocation(line: 710, column: 30, scope: !9) +!100 = !DILocation(line: 711, column: 28, scope: !9) +!101 = !DILocation(line: 722, column: 29, scope: !9) +!102 = !DILocation(line: 744, column: 36, scope: !9) +!103 = !DILocation(line: 744, column: 66, scope: !9) +!104 = !DILocation(line: 743, column: 41, scope: !9) +!105 = !DILocation(line: 743, column: 33, scope: !9) +!106 = !DILocation(line: 743, column: 63, scope: !9) +!107 = !DILocation(line: 743, column: 53, scope: !9) +!108 = !DILocation(line: 750, column: 33, scope: !9) +!109 = !DILocation(line: 481, column: 4, scope: !9)