diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 153b14ce60507..80ee5dcbc0ef0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -94,9 +94,8 @@ class InstructionRule {
   std::optional<SmallVector<SUnit *, 4>> Cache;
 
 public:
-  virtual bool
-  apply(const SUnit *, const ArrayRef<SUnit *>,
-        SmallVectorImpl<SchedGroup> &) {
+  virtual bool apply(const SUnit *, const ArrayRef<SUnit *>,
+                     SmallVectorImpl<SchedGroup> &) {
     return true;
   };
 
@@ -696,6 +695,76 @@ bool PipelineSolver::solveExact() {
   return FinishedExploring;
 }
 
+// Implement an IGLP scheduling strategy.
+class IGLPStrategy {
+protected:
+  ScheduleDAGInstrs *DAG;
+
+  const SIInstrInfo *TII;
+
+public:
+  /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
+  virtual bool applyIGLPStrategy(
+      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+      AMDGPU::SchedulingPhase Phase) = 0;
+
+  // Returns true if this strategy should be applied to a ScheduleDAG.
+  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
+                                   AMDGPU::SchedulingPhase Phase) = 0;
+
+  bool IsBottomUp = true;
+
+  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : DAG(DAG), TII(TII) {}
+
+  virtual ~IGLPStrategy() = default;
+};
+
+class MaxsOpt final : public IGLPStrategy {
+private:
+public:
+  bool applyIGLPStrategy(
+      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+      AMDGPU::SchedulingPhase Phase) override;
+
+  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
+                           AMDGPU::SchedulingPhase Phase) override {
+    return true;
+  }
+
+  MaxsOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : IGLPStrategy(DAG, TII) {
+    IsBottomUp = true;
+  }
+};
+
+bool MaxsOpt::applyIGLPStrategy(
+    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+    AMDGPU::SchedulingPhase Phase) {
+  // Count the number of MFMA instructions.
+  unsigned MFMACount = 0;
+  for (const MachineInstr &I : *DAG)
+    if (TII->isMFMAorWMMA(I))
+      ++MFMACount;
+
+  const unsigned PipelineSyncID = 0;
+  SchedGroup *SG = nullptr;
+  for (unsigned I = 0; I < MFMACount * 3; ++I) {
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+  }
+
+  return true;
+}
+
 template <typename T>
 void PipelineSolver::greedyFind(
     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
@@ -815,33 +884,8 @@ enum IGLPStrategyID : int {
   MFMASmallGemmOptID = 0,
   MFMASmallGemmSingleWaveOptID = 1,
   MFMAExpInterleaveID = 2,
-  MFMAExpSimpleInterleaveID = 3
-};
-
-// Implement a IGLP scheduling strategy.
-class IGLPStrategy {
-protected:
-  ScheduleDAGInstrs *DAG;
-
-  const SIInstrInfo *TII;
-
-public:
-  /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
-  virtual bool applyIGLPStrategy(
-      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
-      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-      AMDGPU::SchedulingPhase Phase) = 0;
-
-  // Returns true if this strategy should be applied to a ScheduleDAG.
-  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
-                                   AMDGPU::SchedulingPhase Phase) = 0;
-
-  bool IsBottomUp = true;
-
-  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : DAG(DAG), TII(TII) {}
-
-  virtual ~IGLPStrategy() = default;
+  MFMAExpSimpleInterleaveID = 3,
+  MaxsID = 4
 };
 
 class MFMASmallGemmOpt final : public IGLPStrategy {
@@ -2335,6 +2379,8 @@ createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
     return std::make_unique<MFMAExpInterleave>(DAG, TII);
   case MFMAExpSimpleInterleaveID:
     return std::make_unique<MFMAExpSimpleInterleave>(DAG, TII);
+  case MaxsID:
+    return std::make_unique<MaxsOpt>(DAG, TII);
   }
 
   llvm_unreachable("Unknown IGLPStrategyID");
diff --git a/llvm/test/CodeGen/AMDGPU/4_tlp_fast.amdgcn b/llvm/test/CodeGen/AMDGPU/4_tlp_fast.amdgcn
new file mode 100644
index 0000000000000..9e8e5fabec7d1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/4_tlp_fast.amdgcn
@@ -0,0 +1,4361 @@
+	.text
+	.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
+	.amdhsa_code_object_version 4
+	.globl	matmul_kernel                   ; -- Begin function matmul_kernel
+	.p2align	8
+	.type	matmul_kernel,@function
+matmul_kernel:                          ; @matmul_kernel
+.Lfunc_begin0:
+	.cfi_sections .debug_frame
+	.cfi_startproc
+	s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+ .fill 63, 4, 0xbf800000 ; s_nop 0 +; %bb.0: + .file 1 "" + .loc 1 0 0 prologue_end ; :0:0 + s_add_i32 s9, s11, 0xff + s_ashr_i32 s11, s9, 31 + s_lshr_b32 s11, s11, 24 + s_add_i32 s9, s9, s11 + s_ashr_i32 s9, s9, 8 + s_lshl_b32 s11, s9, 2 + s_abs_i32 s16, s11 + v_cvt_f32_u32_e32 v1, s16 + s_ashr_i32 s8, s15, 31 + s_lshr_b32 s8, s8, 29 + s_add_i32 s8, s15, s8 + v_rcp_iflag_f32_e32 v1, v1 + s_ashr_i32 s8, s8, 3 + s_sub_i32 s17, 0, s16 + s_mulk_i32 s15, 0x4c + v_mul_f32_e32 v1, 0x4f7ffffe, v1 + v_cvt_u32_f32_e32 v1, v1 + s_mulk_i32 s8, 0xfda1 + s_add_i32 s8, s8, s15 + s_abs_i32 s15, s8 + v_readfirstlane_b32 s18, v1 + s_mul_i32 s17, s17, s18 + s_mul_hi_u32 s17, s18, s17 + s_add_i32 s18, s18, s17 + s_mul_hi_u32 s17, s15, s18 + s_mul_i32 s18, s17, s16 + s_xor_b32 s9, s8, s9 + s_sub_i32 s15, s15, s18 + s_ashr_i32 s9, s9, 31 + s_add_i32 s18, s17, 1 + s_sub_i32 s19, s15, s16 + s_cmp_ge_u32 s15, s16 + s_cselect_b32 s17, s18, s17 + s_cselect_b32 s15, s19, s15 + s_add_i32 s18, s17, 1 + s_cmp_ge_u32 s15, s16 + s_cselect_b32 s15, s18, s17 + s_addk_i32 s10, 0xff + s_ashr_i32 s16, s10, 31 + s_xor_b32 s15, s15, s9 + s_lshr_b32 s16, s16, 24 + s_sub_i32 s9, s15, s9 + s_add_i32 s10, s10, s16 + s_lshl_b32 s15, s9, 2 + s_ashr_i32 s10, s10, 8 + s_sub_i32 s10, s10, s15 + s_min_i32 s10, s10, 4 + s_abs_i32 s16, s10 + v_cvt_f32_u32_e32 v1, s16 + s_sub_i32 s17, 0, s16 + s_mul_i32 s9, s9, s11 + s_sub_i32 s8, s8, s9 + v_rcp_iflag_f32_e32 v1, v1 + s_abs_i32 s11, s8 + s_xor_b32 s9, s8, s10 + s_ashr_i32 s9, s9, 31 + v_mul_f32_e32 v1, 0x4f7ffffe, v1 + v_cvt_u32_f32_e32 v1, v1 + v_lshlrev_b32_e32 v130, 3, v0 + v_and_b32_e32 v2, 56, v130 + v_xor_b32_e32 v130, v130, v0 + v_readfirstlane_b32 s18, v1 + s_mul_i32 s17, s17, s18 + s_mul_hi_u32 s17, s18, s17 + s_add_i32 s18, s18, s17 + s_mul_hi_u32 s17, s11, s18 + s_mul_i32 s18, s17, s16 + s_sub_i32 s11, s11, s18 + s_add_i32 s18, s17, 1 + s_sub_i32 s19, s11, s16 + s_cmp_ge_u32 s11, s16 + s_cselect_b32 s17, s18, s17 + s_cselect_b32 s11, s19, s11 + s_add_i32 
s18, s17, 1 + s_cmp_ge_u32 s11, s16 + s_cselect_b32 s11, s18, s17 + s_xor_b32 s11, s11, s9 + s_sub_i32 s11, s11, s9 + s_mul_i32 s9, s11, s10 + s_sub_i32 s28, s8, s9 + s_add_i32 s28, s28, s15 + s_lshl_b32 s15, s28, 8 + s_mul_i32 s8, s15, s13 + s_ashr_i32 s9, s8, 31 + s_lshl_b32 s10, s13, 5 + s_lshl_b64 s[8:9], s[8:9], 1 + s_add_u32 s16, s2, s8 + s_addc_u32 s17, s3, s9 + v_lshrrev_b32_e32 v1, 3, v0 + s_add_u32 s20, s16, 0x80 + v_mad_u64_u32 v[4:5], s[8:9], s13, v1, v[2:3] + s_addc_u32 s21, s17, 0 + s_add_i32 s29, s12, 63 + v_add_u32_e32 v3, s10, v4 + s_cmp_gt_i32 s29, 63 + v_lshlrev_b32_e32 v155, 1, v4 + v_bfrev_b32_e32 v4, 1 + s_cselect_b64 s[8:9], -1, 0 + v_lshlrev_b32_e32 v156, 1, v3 + s_and_b32 s17, s17, 0xffff + s_mov_b32 s19, 0x27000 + s_mov_b32 s18, 0x7ffffffe + v_cndmask_b32_e64 v5, v4, v155, s[8:9] + v_add_u32_e32 v6, s10, v3 + v_cndmask_b32_e64 v3, v4, v156, s[8:9] + buffer_load_dwordx4 v[66:69], v5, s[16:19], 0 offen + buffer_load_dwordx4 v[70:73], v3, s[16:19], 0 offen + v_add_u32_e32 v3, s10, v6 + v_lshlrev_b32_e32 v157, 1, v6 + v_cndmask_b32_e64 v5, v4, v157, s[8:9] + v_add_u32_e32 v6, s10, v3 + v_lshlrev_b32_e32 v158, 1, v3 + v_cndmask_b32_e64 v3, v4, v158, s[8:9] + buffer_load_dwordx4 v[74:77], v5, s[16:19], 0 offen + buffer_load_dwordx4 v[78:81], v3, s[16:19], 0 offen + v_add_u32_e32 v5, s10, v6 + v_lshlrev_b32_e32 v159, 1, v6 + v_add_u32_e32 v6, s10, v5 + s_cmpk_gt_i32 s29, 0x7f + v_cndmask_b32_e64 v3, v4, v159, s[8:9] + v_lshlrev_b32_e32 v160, 1, v5 + v_lshlrev_b32_e32 v161, 1, v6 + s_cselect_b64 vcc, -1, 0 + s_lshl_b32 s12, s11, 8 + v_cndmask_b32_e64 v5, v4, v160, s[8:9] + buffer_load_dwordx4 v[82:85], v3, s[16:19], 0 offen + buffer_load_dwordx4 v[86:89], v5, s[16:19], 0 offen + v_cndmask_b32_e64 v3, v4, v161, s[8:9] + v_add_lshl_u32 v162, v6, s10, 1 + s_mul_i32 s10, s12, s14 + v_cndmask_b32_e64 v5, v4, v162, s[8:9] + buffer_load_dwordx4 v[90:93], v3, s[16:19], 0 offen + buffer_load_dwordx4 v[94:97], v5, s[16:19], 0 offen + s_lshl_b32 s26, s14, 5 
+ s_ashr_i32 s11, s10, 31 + v_mad_u64_u32 v[2:3], s[24:25], s14, v1, v[2:3] + s_and_b32 s21, s21, 0xffff + s_lshl_b64 s[10:11], s[10:11], 1 + v_add_u32_e32 v3, s26, v2 + s_add_u32 s16, s4, s10 + v_add_u32_e32 v6, s26, v3 + s_addc_u32 s17, s5, s11 + v_add_u32_e32 v7, s26, v6 + v_add_u32_e32 v8, s26, v7 + s_add_u32 s24, s16, 0x80 + v_lshlrev_b32_e32 v163, 1, v2 + v_add_u32_e32 v9, s26, v8 + s_addc_u32 s14, s17, 0 + s_and_b32 s17, s17, 0xffff + v_cndmask_b32_e64 v2, v4, v163, s[8:9] + v_lshlrev_b32_e32 v164, 1, v3 + v_lshlrev_b32_e32 v165, 1, v6 + v_add_u32_e32 v10, s26, v9 + v_cndmask_b32_e64 v3, v4, v164, s[8:9] + buffer_load_dwordx4 v[98:101], v2, s[16:19], 0 offen + buffer_load_dwordx4 v[102:105], v3, s[16:19], 0 offen + v_cndmask_b32_e64 v2, v4, v165, s[8:9] + v_lshlrev_b32_e32 v166, 1, v7 + v_lshlrev_b32_e32 v167, 1, v8 + v_cndmask_b32_e64 v3, v4, v166, s[8:9] + buffer_load_dwordx4 v[106:109], v2, s[16:19], 0 offen + buffer_load_dwordx4 v[110:113], v3, s[16:19], 0 offen + v_cndmask_b32_e64 v2, v4, v167, s[8:9] + v_lshlrev_b32_e32 v168, 1, v9 + v_lshlrev_b32_e32 v169, 1, v10 + v_cndmask_b32_e64 v3, v4, v168, s[8:9] + buffer_load_dwordx4 v[114:117], v2, s[16:19], 0 offen + buffer_load_dwordx4 v[118:121], v3, s[16:19], 0 offen + v_cndmask_b32_e64 v2, v4, v169, s[8:9] + v_add_lshl_u32 v170, v10, s26, 1 + v_cndmask_b32_e64 v3, v4, v170, s[8:9] + buffer_load_dwordx4 v[122:125], v2, s[16:19], 0 offen + buffer_load_dwordx4 v[126:129], v3, s[16:19], 0 offen + s_mov_b32 s22, s18 + s_mov_b32 s23, s19 + v_cndmask_b32_e32 v5, v4, v155, vcc + v_cndmask_b32_e32 v2, v4, v156, vcc + buffer_load_dwordx4 v[54:57], v5, s[20:23], 0 offen + buffer_load_dwordx4 v[50:53], v2, s[20:23], 0 offen + v_cndmask_b32_e32 v2, v4, v157, vcc + v_cndmask_b32_e32 v3, v4, v158, vcc + buffer_load_dwordx4 v[46:49], v2, s[20:23], 0 offen + buffer_load_dwordx4 v[58:61], v3, s[20:23], 0 offen + v_cndmask_b32_e32 v2, v4, v159, vcc + v_cndmask_b32_e32 v3, v4, v160, vcc + buffer_load_dwordx4 v[38:41], v2, 
s[20:23], 0 offen + buffer_load_dwordx4 v[62:65], v3, s[20:23], 0 offen + v_cndmask_b32_e32 v2, v4, v161, vcc + v_cndmask_b32_e32 v3, v4, v162, vcc + buffer_load_dwordx4 v[34:37], v2, s[20:23], 0 offen + buffer_load_dwordx4 v[26:29], v3, s[20:23], 0 offen + s_and_b32 s25, s14, 0xffff + s_mov_b32 s26, s18 + s_mov_b32 s27, s19 + v_cndmask_b32_e32 v2, v4, v163, vcc + v_cndmask_b32_e32 v3, v4, v164, vcc + buffer_load_dwordx4 v[30:33], v2, s[24:27], 0 offen + buffer_load_dwordx4 v[22:25], v3, s[24:27], 0 offen + v_cndmask_b32_e32 v2, v4, v165, vcc + v_cndmask_b32_e32 v3, v4, v166, vcc + buffer_load_dwordx4 v[18:21], v2, s[24:27], 0 offen + buffer_load_dwordx4 v[14:17], v3, s[24:27], 0 offen + v_cndmask_b32_e32 v2, v4, v167, vcc + v_cndmask_b32_e32 v3, v4, v168, vcc + buffer_load_dwordx4 v[10:13], v2, s[24:27], 0 offen + buffer_load_dwordx4 v[6:9], v3, s[24:27], 0 offen + v_cndmask_b32_e32 v2, v4, v169, vcc + v_cndmask_b32_e32 v42, v4, v170, vcc + buffer_load_dwordx4 v[2:5], v2, s[24:27], 0 offen + s_nop 0 + buffer_load_dwordx4 v[42:45], v42, s[24:27], 0 offen + v_and_b32_e32 v130, 56, v130 + v_lshlrev_b32_e32 v130, 1, v130 + v_lshl_or_b32 v130, v1, 7, v130 + s_add_i32 s14, 0, 0x8000 + v_add_u32_e32 v132, 0, v130 + v_add_u32_e32 v131, s14, v130 + s_waitcnt vmcnt(31) + ds_write_b128 v132, v[66:69] + s_waitcnt vmcnt(30) + ds_write_b128 v132, v[70:73] offset:4096 + s_waitcnt vmcnt(29) + ds_write_b128 v132, v[74:77] offset:8192 + s_waitcnt vmcnt(28) + ds_write_b128 v132, v[78:81] offset:12288 + s_waitcnt vmcnt(27) + ds_write_b128 v132, v[82:85] offset:16384 + s_waitcnt vmcnt(26) + ds_write_b128 v132, v[86:89] offset:20480 + s_waitcnt vmcnt(25) + ds_write_b128 v132, v[90:93] offset:24576 + s_waitcnt vmcnt(24) + ds_write_b128 v132, v[94:97] offset:28672 + s_waitcnt vmcnt(23) + ds_write_b128 v132, v[98:101] offset:32768 + s_waitcnt vmcnt(22) + ds_write_b128 v131, v[102:105] offset:4096 + s_waitcnt vmcnt(21) + ds_write_b128 v131, v[106:109] offset:8192 + s_waitcnt vmcnt(20) + 
ds_write_b128 v131, v[110:113] offset:12288 + s_waitcnt vmcnt(19) + ds_write_b128 v131, v[114:117] offset:16384 + s_waitcnt vmcnt(18) + ds_write_b128 v131, v[118:121] offset:20480 + s_waitcnt vmcnt(17) + ds_write_b128 v131, v[122:125] offset:24576 + s_waitcnt vmcnt(16) + ds_write_b128 v131, v[126:129] offset:28672 + v_and_b32_e32 v66, 15, v0 + v_bfe_u32 v106, v0, 4, 2 + v_and_b32_e32 v99, 7, v0 + v_lshrrev_b32_e32 v152, 2, v0 + v_and_or_b32 v130, v1, 16, v66 + v_xor_b32_e32 v1, v106, v99 + v_and_or_b32 v0, v152, 16, v66 + v_lshlrev_b32_e32 v102, 3, v1 + v_lshlrev_b32_e32 v101, 6, v0 + v_lshlrev_b32_e32 v98, 6, v130 + v_or_b32_e32 v0, v102, v101 + v_or_b32_e32 v1, v98, v102 + v_lshlrev_b32_e32 v0, 1, v0 + v_lshl_add_u32 v1, v1, 1, 0 + v_add_u32_e32 v133, 0, v0 + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[94:97], v1 + ds_read_b128 v[90:93], v1 offset:4096 + v_add_u32_e32 v0, s14, v0 + ds_read_b128 v[78:81], v133 offset:32768 + ds_read_b128 v[74:77], v0 offset:4096 + ds_read_b128 v[86:89], v1 offset:8192 + ds_read_b128 v[82:85], v1 offset:12288 + ds_read_b128 v[70:73], v0 offset:8192 + ds_read_b128 v[66:69], v0 offset:12288 + s_mov_b32 s16, 0 + v_or_b32_e32 v100, 0x800, v98 + v_or_b32_e32 v103, 0x1000, v98 + v_or_b32_e32 v104, 0x1800, v98 + v_or_b32_e32 v105, 0x1000, v101 + s_cmpk_gt_i32 s29, 0xbf + v_or_b32_e32 v106, 4, v106 + s_cbranch_scc1 .LBB0_3 +; %bb.1: ; %.._crit_edge_crit_edge + v_xor_b32_e32 v113, v106, v99 + v_or_b32_e32 v107, 0x2000, v101 + v_or_b32_e32 v108, 0x3000, v101 + v_lshlrev_b32_e32 v113, 3, v113 + v_or_b32_e32 v109, 0x2000, v98 + v_or_b32_e32 v110, 0x2800, v98 + v_or_b32_e32 v111, 0x3000, v98 + v_or_b32_e32 v112, 0x3800, v98 + v_or_b32_e32 v140, v113, v107 + v_or_b32_e32 v138, v113, v108 + v_or_b32_e32 v154, v107, v102 + v_or_b32_e32 v153, v108, v102 + v_or_b32_e32 v151, v109, v102 + v_or_b32_e32 v150, v110, v102 + v_or_b32_e32 v149, v111, v102 + v_or_b32_e32 v148, v112, v102 + v_or_b32_e32 v147, v113, v101 + v_or_b32_e32 v145, v113, v98 + 
v_or_b32_e32 v146, v100, v113 + v_or_b32_e32 v144, v113, v105 + v_or_b32_e32 v142, v103, v113 + v_or_b32_e32 v143, v104, v113 + v_or_b32_e32 v141, 0x800, v140 + v_or_b32_e32 v139, 0x800, v138 + v_or_b32_e32 v136, v109, v113 + v_or_b32_e32 v137, v110, v113 + v_or_b32_e32 v134, v111, v113 + v_or_b32_e32 v135, v112, v113 + s_cbranch_execz .LBB0_4 +; %bb.2: + v_accvgpr_write_b32 a0, s16 + v_accvgpr_write_b32 a1, s16 + v_accvgpr_write_b32 a2, s16 + v_accvgpr_write_b32 a3, s16 + v_accvgpr_write_b32 a4, s16 + v_accvgpr_write_b32 a5, s16 + v_accvgpr_write_b32 a6, s16 + v_accvgpr_write_b32 a7, s16 + v_accvgpr_write_b32 a8, s16 + v_accvgpr_write_b32 a9, s16 + v_accvgpr_write_b32 a10, s16 + v_accvgpr_write_b32 a11, s16 + v_accvgpr_write_b32 a12, s16 + v_accvgpr_write_b32 a13, s16 + v_accvgpr_write_b32 a14, s16 + v_accvgpr_write_b32 a15, s16 + v_accvgpr_write_b32 a16, s16 + v_accvgpr_write_b32 a17, s16 + v_accvgpr_write_b32 a18, s16 + v_accvgpr_write_b32 a19, s16 + v_accvgpr_write_b32 a20, s16 + v_accvgpr_write_b32 a21, s16 + v_accvgpr_write_b32 a22, s16 + v_accvgpr_write_b32 a23, s16 + v_accvgpr_write_b32 a24, s16 + v_accvgpr_write_b32 a25, s16 + v_accvgpr_write_b32 a26, s16 + v_accvgpr_write_b32 a27, s16 + v_accvgpr_write_b32 a28, s16 + v_accvgpr_write_b32 a29, s16 + v_accvgpr_write_b32 a30, s16 + v_accvgpr_write_b32 a31, s16 + v_accvgpr_write_b32 a64, s16 + v_accvgpr_write_b32 a65, s16 + v_accvgpr_write_b32 a66, s16 + v_accvgpr_write_b32 a67, s16 + v_accvgpr_write_b32 a68, s16 + v_accvgpr_write_b32 a69, s16 + v_accvgpr_write_b32 a70, s16 + v_accvgpr_write_b32 a71, s16 + v_accvgpr_write_b32 a72, s16 + v_accvgpr_write_b32 a73, s16 + v_accvgpr_write_b32 a74, s16 + v_accvgpr_write_b32 a75, s16 + v_accvgpr_write_b32 a76, s16 + v_accvgpr_write_b32 a77, s16 + v_accvgpr_write_b32 a78, s16 + v_accvgpr_write_b32 a79, s16 + v_accvgpr_write_b32 a80, s16 + v_accvgpr_write_b32 a81, s16 + v_accvgpr_write_b32 a82, s16 + v_accvgpr_write_b32 a83, s16 + v_accvgpr_write_b32 a84, s16 + 
v_accvgpr_write_b32 a85, s16 + v_accvgpr_write_b32 a86, s16 + v_accvgpr_write_b32 a87, s16 + v_accvgpr_write_b32 a88, s16 + v_accvgpr_write_b32 a89, s16 + v_accvgpr_write_b32 a90, s16 + v_accvgpr_write_b32 a91, s16 + v_accvgpr_write_b32 a92, s16 + v_accvgpr_write_b32 a93, s16 + v_accvgpr_write_b32 a94, s16 + v_accvgpr_write_b32 a95, s16 + v_accvgpr_write_b32 a32, s16 + v_accvgpr_write_b32 a33, s16 + v_accvgpr_write_b32 a34, s16 + v_accvgpr_write_b32 a35, s16 + v_accvgpr_write_b32 a36, s16 + v_accvgpr_write_b32 a37, s16 + v_accvgpr_write_b32 a38, s16 + v_accvgpr_write_b32 a39, s16 + v_accvgpr_write_b32 a40, s16 + v_accvgpr_write_b32 a41, s16 + v_accvgpr_write_b32 a42, s16 + v_accvgpr_write_b32 a43, s16 + v_accvgpr_write_b32 a44, s16 + v_accvgpr_write_b32 a45, s16 + v_accvgpr_write_b32 a46, s16 + v_accvgpr_write_b32 a47, s16 + v_accvgpr_write_b32 a48, s16 + v_accvgpr_write_b32 a49, s16 + v_accvgpr_write_b32 a50, s16 + v_accvgpr_write_b32 a51, s16 + v_accvgpr_write_b32 a52, s16 + v_accvgpr_write_b32 a53, s16 + v_accvgpr_write_b32 a54, s16 + v_accvgpr_write_b32 a55, s16 + v_accvgpr_write_b32 a56, s16 + v_accvgpr_write_b32 a57, s16 + v_accvgpr_write_b32 a58, s16 + v_accvgpr_write_b32 a59, s16 + v_accvgpr_write_b32 a60, s16 + v_accvgpr_write_b32 a61, s16 + v_accvgpr_write_b32 a62, s16 + v_accvgpr_write_b32 a63, s16 + v_accvgpr_write_b32 a96, s16 + v_accvgpr_write_b32 a97, s16 + v_accvgpr_write_b32 a98, s16 + v_accvgpr_write_b32 a99, s16 + v_accvgpr_write_b32 a100, s16 + v_accvgpr_write_b32 a101, s16 + v_accvgpr_write_b32 a102, s16 + v_accvgpr_write_b32 a103, s16 + v_accvgpr_write_b32 a104, s16 + v_accvgpr_write_b32 a105, s16 + v_accvgpr_write_b32 a106, s16 + v_accvgpr_write_b32 a107, s16 + v_accvgpr_write_b32 a108, s16 + v_accvgpr_write_b32 a109, s16 + v_accvgpr_write_b32 a110, s16 + v_accvgpr_write_b32 a111, s16 + v_accvgpr_write_b32 a112, s16 + v_accvgpr_write_b32 a113, s16 + v_accvgpr_write_b32 a114, s16 + v_accvgpr_write_b32 a115, s16 + v_accvgpr_write_b32 a116, s16 
+ v_accvgpr_write_b32 a117, s16 + v_accvgpr_write_b32 a118, s16 + v_accvgpr_write_b32 a119, s16 + v_accvgpr_write_b32 a120, s16 + v_accvgpr_write_b32 a121, s16 + v_accvgpr_write_b32 a122, s16 + v_accvgpr_write_b32 a123, s16 + v_accvgpr_write_b32 a124, s16 + v_accvgpr_write_b32 a125, s16 + v_accvgpr_write_b32 a126, s16 + v_accvgpr_write_b32 a127, s16 + v_accvgpr_write_b32 a132, s16 + v_accvgpr_write_b32 a133, s16 + v_accvgpr_write_b32 a134, s16 + v_accvgpr_write_b32 a135, s16 + v_accvgpr_write_b32 a136, s16 + v_accvgpr_write_b32 a137, s16 + v_accvgpr_write_b32 a138, s16 + v_accvgpr_write_b32 a139, s16 + v_accvgpr_write_b32 a140, s16 + v_accvgpr_write_b32 a141, s16 + v_accvgpr_write_b32 a142, s16 + v_accvgpr_write_b32 a143, s16 + v_accvgpr_write_b32 a144, s16 + v_accvgpr_write_b32 a145, s16 + v_accvgpr_write_b32 a146, s16 + v_accvgpr_write_b32 a147, s16 + v_accvgpr_write_b32 a148, s16 + v_accvgpr_write_b32 a149, s16 + v_accvgpr_write_b32 a150, s16 + v_accvgpr_write_b32 a151, s16 + v_accvgpr_write_b32 a152, s16 + v_accvgpr_write_b32 a153, s16 + v_accvgpr_write_b32 a154, s16 + v_accvgpr_write_b32 a155, s16 + v_accvgpr_write_b32 a156, s16 + v_accvgpr_write_b32 a157, s16 + v_accvgpr_write_b32 a158, s16 + v_accvgpr_write_b32 a159, s16 + v_accvgpr_write_b32 a160, s16 + v_accvgpr_write_b32 a161, s16 + v_accvgpr_write_b32 a162, s16 + v_accvgpr_write_b32 a163, s16 + v_accvgpr_write_b32 a220, s16 + v_accvgpr_write_b32 a221, s16 + v_accvgpr_write_b32 a222, s16 + v_accvgpr_write_b32 a223, s16 + v_accvgpr_write_b32 a224, s16 + v_accvgpr_write_b32 a225, s16 + v_accvgpr_write_b32 a226, s16 + v_accvgpr_write_b32 a227, s16 + v_accvgpr_write_b32 a232, s16 + v_accvgpr_write_b32 a233, s16 + v_accvgpr_write_b32 a234, s16 + v_accvgpr_write_b32 a235, s16 + v_accvgpr_write_b32 a236, s16 + v_accvgpr_write_b32 a237, s16 + v_accvgpr_write_b32 a238, s16 + v_accvgpr_write_b32 a239, s16 + v_accvgpr_write_b32 a240, s16 + v_accvgpr_write_b32 a241, s16 + v_accvgpr_write_b32 a242, s16 + 
v_accvgpr_write_b32 a243, s16 + v_accvgpr_write_b32 a244, s16 + v_accvgpr_write_b32 a245, s16 + v_accvgpr_write_b32 a246, s16 + v_accvgpr_write_b32 a247, s16 + v_accvgpr_write_b32 a248, s16 + v_accvgpr_write_b32 a249, s16 + v_accvgpr_write_b32 a250, s16 + v_accvgpr_write_b32 a251, s16 + v_accvgpr_write_b32 a252, s16 + v_accvgpr_write_b32 a253, s16 + v_accvgpr_write_b32 a254, s16 + v_accvgpr_write_b32 a255, s16 + v_accvgpr_write_b32 a164, s16 + v_accvgpr_write_b32 a165, s16 + v_accvgpr_write_b32 a166, s16 + v_accvgpr_write_b32 a167, s16 + v_accvgpr_write_b32 a168, s16 + v_accvgpr_write_b32 a169, s16 + v_accvgpr_write_b32 a170, s16 + v_accvgpr_write_b32 a171, s16 + v_accvgpr_write_b32 a172, s16 + v_accvgpr_write_b32 a173, s16 + v_accvgpr_write_b32 a174, s16 + v_accvgpr_write_b32 a175, s16 + v_accvgpr_write_b32 a176, s16 + v_accvgpr_write_b32 a177, s16 + v_accvgpr_write_b32 a178, s16 + v_accvgpr_write_b32 a179, s16 + v_accvgpr_write_b32 a128, s16 + v_accvgpr_write_b32 a129, s16 + v_accvgpr_write_b32 a130, s16 + v_accvgpr_write_b32 a131, s16 + v_accvgpr_write_b32 a180, s16 + v_accvgpr_write_b32 a181, s16 + v_accvgpr_write_b32 a182, s16 + v_accvgpr_write_b32 a183, s16 + v_accvgpr_write_b32 a184, s16 + v_accvgpr_write_b32 a185, s16 + v_accvgpr_write_b32 a186, s16 + v_accvgpr_write_b32 a187, s16 + v_accvgpr_write_b32 a188, s16 + v_accvgpr_write_b32 a189, s16 + v_accvgpr_write_b32 a190, s16 + v_accvgpr_write_b32 a191, s16 + v_accvgpr_write_b32 a192, s16 + v_accvgpr_write_b32 a193, s16 + v_accvgpr_write_b32 a194, s16 + v_accvgpr_write_b32 a195, s16 + v_accvgpr_write_b32 a204, s16 + v_accvgpr_write_b32 a205, s16 + v_accvgpr_write_b32 a206, s16 + v_accvgpr_write_b32 a207, s16 + v_accvgpr_write_b32 a216, s16 + v_accvgpr_write_b32 a217, s16 + v_accvgpr_write_b32 a218, s16 + v_accvgpr_write_b32 a219, s16 + v_accvgpr_write_b32 a228, s16 + v_accvgpr_write_b32 a229, s16 + v_accvgpr_write_b32 a230, s16 + v_accvgpr_write_b32 a231, s16 + v_accvgpr_write_b32 a196, s16 + 
v_accvgpr_write_b32 a197, s16 + v_accvgpr_write_b32 a198, s16 + v_accvgpr_write_b32 a199, s16 + v_accvgpr_write_b32 a200, s16 + v_accvgpr_write_b32 a201, s16 + v_accvgpr_write_b32 a202, s16 + v_accvgpr_write_b32 a203, s16 + v_accvgpr_write_b32 a208, s16 + v_accvgpr_write_b32 a209, s16 + v_accvgpr_write_b32 a210, s16 + v_accvgpr_write_b32 a211, s16 + v_accvgpr_write_b32 a212, s16 + v_accvgpr_write_b32 a213, s16 + v_accvgpr_write_b32 a214, s16 + v_accvgpr_write_b32 a215, s16 + s_branch .LBB0_6 +.LBB0_3: + ; implicit-def: $sgpr16 + ; implicit-def: $vgpr154 + ; implicit-def: $vgpr153 + ; implicit-def: $vgpr151 + ; implicit-def: $vgpr150 + ; implicit-def: $vgpr149 + ; implicit-def: $vgpr148 + ; implicit-def: $vgpr147 + ; implicit-def: $vgpr145 + ; implicit-def: $vgpr146 + ; implicit-def: $vgpr144 + ; implicit-def: $vgpr142 + ; implicit-def: $vgpr143 + ; implicit-def: $vgpr140 + ; implicit-def: $vgpr141 + ; implicit-def: $vgpr138 + ; implicit-def: $vgpr139 + ; implicit-def: $vgpr136 + ; implicit-def: $vgpr137 + ; implicit-def: $vgpr134 + ; implicit-def: $vgpr135 +.LBB0_4: ; %.lr.ph + s_lshr_b32 s16, s29, 6 + s_add_u32 s4, s4, s10 + s_addc_u32 s5, s5, s11 + s_mul_i32 s13, s13, s28 + v_add_u32_e32 v108, v101, v102 + v_xor_b32_e32 v99, v106, v99 + v_lshl_add_u32 v108, v108, 1, s14 + v_lshlrev_b32_e32 v99, 3, v99 + v_or_b32_e32 v107, 0x2000, v101 + v_add_u32_e32 v171, 0x1000, v108 + v_or_b32_e32 v108, 0x3000, v101 + v_or_b32_e32 v109, 0x2000, v98 + v_or_b32_e32 v110, 0x2800, v98 + v_or_b32_e32 v111, 0x3000, v98 + v_or_b32_e32 v112, 0x3800, v98 + v_or_b32_e32 v140, v99, v107 + v_or_b32_e32 v138, v99, v108 + v_or_b32_e32 v143, v104, v99 + s_add_u32 s4, s4, 0x100 + s_addc_u32 s5, s5, 0 + s_lshl_b32 s8, s13, 8 + s_ashr_i32 s9, s8, 31 + s_lshl_b64 s[8:9], s[8:9], 1 + s_add_u32 s2, s8, s2 + s_addc_u32 s3, s9, s3 + s_add_u32 s2, s2, 0x100 + v_or_b32_e32 v141, 0x800, v140 + v_or_b32_e32 v148, v112, v102 + v_or_b32_e32 v149, v111, v102 + v_or_b32_e32 v150, v110, v102 + v_or_b32_e32 
v151, v109, v102 + v_or_b32_e32 v153, v108, v102 + v_or_b32_e32 v154, v107, v102 + v_or_b32_e32 v145, v99, v98 + v_lshl_add_u32 v172, v145, 1, 0 + v_or_b32_e32 v146, v100, v99 + v_add_u32_e32 v98, v98, v99 + v_lshl_add_u32 v173, v98, 1, 0 + v_add_u32_e32 v98, v99, v101 + v_lshl_add_u32 v175, v98, 1, s14 + v_or_b32_e32 v147, v99, v101 + v_lshl_add_u32 v174, v147, 1, 0 + v_or_b32_e32 v144, v99, v105 + v_or_b32_e32 v142, v103, v99 + v_or_b32_e32 v139, 0x800, v138 + v_or_b32_e32 v136, v109, v99 + v_or_b32_e32 v137, v110, v99 + v_or_b32_e32 v134, v111, v99 + v_or_b32_e32 v135, v112, v99 + s_addc_u32 s3, s3, 0 + s_add_i32 s13, s16, -2 + v_accvgpr_write_b32 a15, 0 + v_accvgpr_write_b32 a14, 0 + v_accvgpr_write_b32 a13, 0 + v_accvgpr_write_b32 a12, 0 + v_accvgpr_write_b32 a11, 0 + v_accvgpr_write_b32 a10, 0 + v_accvgpr_write_b32 a9, 0 + v_accvgpr_write_b32 a8, 0 + v_accvgpr_write_b32 a7, 0 + v_accvgpr_write_b32 a6, 0 + v_accvgpr_write_b32 a5, 0 + v_accvgpr_write_b32 a4, 0 + v_accvgpr_write_b32 a3, 0 + v_accvgpr_write_b32 a2, 0 + v_accvgpr_write_b32 a1, 0 + v_accvgpr_write_b32 a0, 0 + v_accvgpr_write_b32 a31, 0 + v_accvgpr_write_b32 a30, 0 + v_accvgpr_write_b32 a29, 0 + v_accvgpr_write_b32 a28, 0 + v_accvgpr_write_b32 a27, 0 + v_accvgpr_write_b32 a26, 0 + v_accvgpr_write_b32 a25, 0 + v_accvgpr_write_b32 a24, 0 + v_accvgpr_write_b32 a23, 0 + v_accvgpr_write_b32 a22, 0 + v_accvgpr_write_b32 a21, 0 + v_accvgpr_write_b32 a20, 0 + v_accvgpr_write_b32 a19, 0 + v_accvgpr_write_b32 a18, 0 + v_accvgpr_write_b32 a17, 0 + v_accvgpr_write_b32 a16, 0 + v_accvgpr_write_b32 a79, 0 + v_accvgpr_write_b32 a78, 0 + v_accvgpr_write_b32 a77, 0 + v_accvgpr_write_b32 a76, 0 + v_accvgpr_write_b32 a75, 0 + v_accvgpr_write_b32 a74, 0 + v_accvgpr_write_b32 a73, 0 + v_accvgpr_write_b32 a72, 0 + v_accvgpr_write_b32 a71, 0 + v_accvgpr_write_b32 a70, 0 + v_accvgpr_write_b32 a69, 0 + v_accvgpr_write_b32 a68, 0 + v_accvgpr_write_b32 a67, 0 + v_accvgpr_write_b32 a66, 0 + v_accvgpr_write_b32 a65, 0 + 
v_accvgpr_write_b32 a64, 0 + v_accvgpr_write_b32 a95, 0 + v_accvgpr_write_b32 a94, 0 + v_accvgpr_write_b32 a93, 0 + v_accvgpr_write_b32 a92, 0 + v_accvgpr_write_b32 a91, 0 + v_accvgpr_write_b32 a90, 0 + v_accvgpr_write_b32 a89, 0 + v_accvgpr_write_b32 a88, 0 + v_accvgpr_write_b32 a87, 0 + v_accvgpr_write_b32 a86, 0 + v_accvgpr_write_b32 a85, 0 + v_accvgpr_write_b32 a84, 0 + v_accvgpr_write_b32 a83, 0 + v_accvgpr_write_b32 a82, 0 + v_accvgpr_write_b32 a81, 0 + v_accvgpr_write_b32 a80, 0 + v_accvgpr_write_b32 a47, 0 + v_accvgpr_write_b32 a46, 0 + v_accvgpr_write_b32 a45, 0 + v_accvgpr_write_b32 a44, 0 + v_accvgpr_write_b32 a43, 0 + v_accvgpr_write_b32 a42, 0 + v_accvgpr_write_b32 a41, 0 + v_accvgpr_write_b32 a40, 0 + v_accvgpr_write_b32 a39, 0 + v_accvgpr_write_b32 a38, 0 + v_accvgpr_write_b32 a37, 0 + v_accvgpr_write_b32 a36, 0 + v_accvgpr_write_b32 a35, 0 + v_accvgpr_write_b32 a34, 0 + v_accvgpr_write_b32 a33, 0 + v_accvgpr_write_b32 a32, 0 + v_accvgpr_write_b32 a63, 0 + v_accvgpr_write_b32 a62, 0 + v_accvgpr_write_b32 a61, 0 + v_accvgpr_write_b32 a60, 0 + v_accvgpr_write_b32 a59, 0 + v_accvgpr_write_b32 a58, 0 + v_accvgpr_write_b32 a57, 0 + v_accvgpr_write_b32 a56, 0 + v_accvgpr_write_b32 a55, 0 + v_accvgpr_write_b32 a54, 0 + v_accvgpr_write_b32 a53, 0 + v_accvgpr_write_b32 a52, 0 + v_accvgpr_write_b32 a51, 0 + v_accvgpr_write_b32 a50, 0 + v_accvgpr_write_b32 a49, 0 + v_accvgpr_write_b32 a48, 0 + v_accvgpr_write_b32 a111, 0 + v_accvgpr_write_b32 a110, 0 + v_accvgpr_write_b32 a109, 0 + v_accvgpr_write_b32 a108, 0 + v_accvgpr_write_b32 a107, 0 + v_accvgpr_write_b32 a106, 0 + v_accvgpr_write_b32 a105, 0 + v_accvgpr_write_b32 a104, 0 + v_accvgpr_write_b32 a103, 0 + v_accvgpr_write_b32 a102, 0 + v_accvgpr_write_b32 a101, 0 + v_accvgpr_write_b32 a100, 0 + v_accvgpr_write_b32 a99, 0 + v_accvgpr_write_b32 a98, 0 + v_accvgpr_write_b32 a97, 0 + v_accvgpr_write_b32 a96, 0 + v_accvgpr_write_b32 a127, 0 + v_accvgpr_write_b32 a126, 0 + v_accvgpr_write_b32 a125, 0 + 
v_accvgpr_write_b32 a124, 0 + v_accvgpr_write_b32 a123, 0 + v_accvgpr_write_b32 a122, 0 + v_accvgpr_write_b32 a121, 0 + v_accvgpr_write_b32 a120, 0 + v_accvgpr_write_b32 a119, 0 + v_accvgpr_write_b32 a118, 0 + v_accvgpr_write_b32 a117, 0 + v_accvgpr_write_b32 a116, 0 + v_accvgpr_write_b32 a115, 0 + v_accvgpr_write_b32 a114, 0 + v_accvgpr_write_b32 a113, 0 + v_accvgpr_write_b32 a112, 0 + v_accvgpr_write_b32 a147, 0 + v_accvgpr_write_b32 a146, 0 + v_accvgpr_write_b32 a145, 0 + v_accvgpr_write_b32 a144, 0 + v_accvgpr_write_b32 a143, 0 + v_accvgpr_write_b32 a142, 0 + v_accvgpr_write_b32 a141, 0 + v_accvgpr_write_b32 a140, 0 + v_accvgpr_write_b32 a139, 0 + v_accvgpr_write_b32 a138, 0 + v_accvgpr_write_b32 a137, 0 + v_accvgpr_write_b32 a136, 0 + v_accvgpr_write_b32 a135, 0 + v_accvgpr_write_b32 a134, 0 + v_accvgpr_write_b32 a133, 0 + v_accvgpr_write_b32 a132, 0 + v_accvgpr_write_b32 a163, 0 + v_accvgpr_write_b32 a162, 0 + v_accvgpr_write_b32 a161, 0 + v_accvgpr_write_b32 a160, 0 + v_accvgpr_write_b32 a159, 0 + v_accvgpr_write_b32 a158, 0 + v_accvgpr_write_b32 a157, 0 + v_accvgpr_write_b32 a156, 0 + v_accvgpr_write_b32 a155, 0 + v_accvgpr_write_b32 a154, 0 + v_accvgpr_write_b32 a153, 0 + v_accvgpr_write_b32 a152, 0 + v_accvgpr_write_b32 a151, 0 + v_accvgpr_write_b32 a150, 0 + v_accvgpr_write_b32 a149, 0 + v_accvgpr_write_b32 a148, 0 + v_accvgpr_write_b32 a239, 0 + v_accvgpr_write_b32 a238, 0 + v_accvgpr_write_b32 a237, 0 + v_accvgpr_write_b32 a236, 0 + v_accvgpr_write_b32 a235, 0 + v_accvgpr_write_b32 a234, 0 + v_accvgpr_write_b32 a233, 0 + v_accvgpr_write_b32 a232, 0 + v_accvgpr_write_b32 a227, 0 + v_accvgpr_write_b32 a226, 0 + v_accvgpr_write_b32 a225, 0 + v_accvgpr_write_b32 a224, 0 + v_accvgpr_write_b32 a223, 0 + v_accvgpr_write_b32 a222, 0 + v_accvgpr_write_b32 a221, 0 + v_accvgpr_write_b32 a220, 0 + v_accvgpr_write_b32 a255, 0 + v_accvgpr_write_b32 a254, 0 + v_accvgpr_write_b32 a253, 0 + v_accvgpr_write_b32 a252, 0 + v_accvgpr_write_b32 a251, 0 + v_accvgpr_write_b32 
a250, 0 + v_accvgpr_write_b32 a249, 0 + v_accvgpr_write_b32 a248, 0 + v_accvgpr_write_b32 a247, 0 + v_accvgpr_write_b32 a246, 0 + v_accvgpr_write_b32 a245, 0 + v_accvgpr_write_b32 a244, 0 + v_accvgpr_write_b32 a243, 0 + v_accvgpr_write_b32 a242, 0 + v_accvgpr_write_b32 a241, 0 + v_accvgpr_write_b32 a240, 0 + v_accvgpr_write_b32 a179, 0 + v_accvgpr_write_b32 a178, 0 + v_accvgpr_write_b32 a177, 0 + v_accvgpr_write_b32 a176, 0 + v_accvgpr_write_b32 a175, 0 + v_accvgpr_write_b32 a174, 0 + v_accvgpr_write_b32 a173, 0 + v_accvgpr_write_b32 a172, 0 + v_accvgpr_write_b32 a171, 0 + v_accvgpr_write_b32 a170, 0 + v_accvgpr_write_b32 a169, 0 + v_accvgpr_write_b32 a168, 0 + v_accvgpr_write_b32 a167, 0 + v_accvgpr_write_b32 a166, 0 + v_accvgpr_write_b32 a165, 0 + v_accvgpr_write_b32 a164, 0 + v_accvgpr_write_b32 a191, 0 + v_accvgpr_write_b32 a190, 0 + v_accvgpr_write_b32 a189, 0 + v_accvgpr_write_b32 a188, 0 + v_accvgpr_write_b32 a187, 0 + v_accvgpr_write_b32 a186, 0 + v_accvgpr_write_b32 a185, 0 + v_accvgpr_write_b32 a184, 0 + v_accvgpr_write_b32 a183, 0 + v_accvgpr_write_b32 a182, 0 + v_accvgpr_write_b32 a181, 0 + v_accvgpr_write_b32 a180, 0 + v_accvgpr_write_b32 a131, 0 + v_accvgpr_write_b32 a130, 0 + v_accvgpr_write_b32 a129, 0 + v_accvgpr_write_b32 a128, 0 + v_accvgpr_write_b32 a231, 0 + v_accvgpr_write_b32 a230, 0 + v_accvgpr_write_b32 a229, 0 + v_accvgpr_write_b32 a228, 0 + v_accvgpr_write_b32 a219, 0 + v_accvgpr_write_b32 a218, 0 + v_accvgpr_write_b32 a217, 0 + v_accvgpr_write_b32 a216, 0 + v_accvgpr_write_b32 a207, 0 + v_accvgpr_write_b32 a206, 0 + v_accvgpr_write_b32 a205, 0 + v_accvgpr_write_b32 a204, 0 + v_accvgpr_write_b32 a195, 0 + v_accvgpr_write_b32 a194, 0 + v_accvgpr_write_b32 a193, 0 + v_accvgpr_write_b32 a192, 0 + v_accvgpr_write_b32 a215, 0 + v_accvgpr_write_b32 a214, 0 + v_accvgpr_write_b32 a213, 0 + v_accvgpr_write_b32 a212, 0 + v_accvgpr_write_b32 a211, 0 + v_accvgpr_write_b32 a210, 0 + v_accvgpr_write_b32 a209, 0 + v_accvgpr_write_b32 a208, 0 + 
v_accvgpr_write_b32 a203, 0 + v_accvgpr_write_b32 a202, 0 + v_accvgpr_write_b32 a201, 0 + v_accvgpr_write_b32 a200, 0 + v_accvgpr_write_b32 a199, 0 + v_accvgpr_write_b32 a198, 0 + v_accvgpr_write_b32 a197, 0 + v_accvgpr_write_b32 a196, 0 + s_mov_b32 s11, 0x27000 + s_mov_b32 s10, 0x7ffffffe +.LBB0_5: ; =>This Inner Loop Header: Depth=1 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[94:95], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[80:81], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16_f16 a[4:7], v[74:75], v[94:95], a[4:7] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[4:7], v[76:77], v[96:97], a[4:7] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[8:11], v[78:79], v[90:91], a[8:11] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[8:11], v[80:81], v[92:93], a[8:11] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[12:15], v[74:75], v[90:91], a[12:15] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[12:15], v[76:77], v[92:93], a[12:15] + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(8) SyncID(0) + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16_f16 a[16:19], v[70:71], v[94:95], a[16:19] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[16:19], v[72:73], v[96:97], a[16:19] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[20:23], v[66:67], v[94:95], a[20:23] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[20:23], v[68:69], v[96:97], a[20:23] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[70:71], v[90:91], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[72:73], v[92:93], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[28:31], v[66:67], v[90:91], a[28:31] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 
a[28:31], v[68:69], v[92:93], a[28:31] + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(8) SyncID(0) + ; sched_barrier mask(0x00000000) + v_mfma_f32_16x16x16_f16 a[32:35], v[78:79], v[86:87], a[32:35] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[32:35], v[80:81], v[88:89], a[32:35] + ds_read_b128 v[106:109], v0 offset:16384 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + v_mfma_f32_16x16x16_f16 a[36:39], v[74:75], v[86:87], a[36:39] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[36:39], v[76:77], v[88:89], a[36:39] + ds_read_b128 v[110:113], v171 offset:16384 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + v_mfma_f32_16x16x16_f16 a[40:43], v[78:79], v[82:83], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[80:81], v[84:85], a[40:43] + ds_read_b128 v[98:101], v0 offset:24576 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + v_mfma_f32_16x16x16_f16 a[44:47], v[74:75], v[82:83], a[44:47] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[44:47], v[76:77], v[84:85], a[44:47] + ds_read_b128 v[102:105], v171 offset:24576 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_barrier mask(0x00000000) + v_mfma_f32_16x16x16_f16 a[48:51], v[70:71], v[86:87], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[48:51], v[72:73], v[88:89], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[52:55], v[66:67], v[86:87], a[52:55] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[52:55], v[68:69], 
v[88:89], a[52:55] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[70:71], v[82:83], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[72:73], v[84:85], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[60:63], v[66:67], v[82:83], a[60:63] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[60:63], v[68:69], v[84:85], a[60:63] + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(8) SyncID(0) + ; sched_barrier mask(0x00000000) + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16_f16 a[64:67], v[106:107], v[94:95], a[64:67] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[64:67], v[108:109], v[96:97], a[64:67] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16_f16 a[68:71], v[110:111], v[94:95], a[68:71] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[68:71], v[112:113], v[96:97], a[68:71] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[72:75], v[106:107], v[90:91], a[72:75] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[72:75], v[108:109], v[92:93], a[72:75] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[76:79], v[110:111], v[90:91], a[76:79] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[76:79], v[112:113], v[92:93], a[76:79] + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(8) SyncID(0) + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16_f16 a[80:83], v[98:99], v[94:95], a[80:83] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[80:83], v[100:101], v[96:97], a[80:83] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[84:87], v[102:103], v[94:95], a[84:87] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[84:87], v[104:105], v[96:97], a[84:87] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[88:91], v[98:99], v[90:91], a[88:91] + ; 
sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[88:91], v[100:101], v[92:93], a[88:91] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[92:95], v[102:103], v[90:91], a[92:95] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[92:95], v[104:105], v[92:93], a[92:95] + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(8) SyncID(0) + ; sched_barrier mask(0x00000000) + v_mfma_f32_16x16x16_f16 a[96:99], v[106:107], v[86:87], a[96:99] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[96:99], v[108:109], v[88:89], a[96:99] + ds_read_b128 v[90:93], v1 offset:16384 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + v_mfma_f32_16x16x16_f16 a[100:103], v[110:111], v[86:87], a[100:103] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[100:103], v[112:113], v[88:89], a[100:103] + ds_read_b128 v[114:117], v1 offset:20480 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + v_mfma_f32_16x16x16_f16 a[104:107], v[106:107], v[82:83], a[104:107] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[104:107], v[108:109], v[84:85], a[104:107] + ds_read_b128 v[176:179], v1 offset:24576 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + v_mfma_f32_16x16x16_f16 a[108:111], v[110:111], v[82:83], a[108:111] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[108:111], v[112:113], v[84:85], a[108:111] + ds_read_b128 v[180:183], v1 offset:28672 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_barrier mask(0x00000000) + v_mfma_f32_16x16x16_f16 a[112:115], v[98:99], 
v[86:87], a[112:115] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[112:115], v[100:101], v[88:89], a[112:115] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[116:119], v[102:103], v[86:87], a[116:119] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[116:119], v[104:105], v[88:89], a[116:119] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[120:123], v[98:99], v[82:83], a[120:123] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[120:123], v[100:101], v[84:85], a[120:123] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[124:127], v[102:103], v[82:83], a[124:127] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[124:127], v[104:105], v[84:85], a[124:127] + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(8) SyncID(0) + ; sched_barrier mask(0x00000000) + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16_f16 a[132:135], v[78:79], v[90:91], a[132:135] + ; sched_barrier mask(0x000007F6) + s_and_b32 s9, s3, 0xffff + s_mov_b32 s8, s2 + v_mfma_f32_16x16x16_f16 a[132:135], v[80:81], v[92:93], a[132:135] + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + v_mfma_f32_16x16x16_f16 a[136:139], v[74:75], v[90:91], a[136:139] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[136:139], v[76:77], v[92:93], a[136:139] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16_f16 a[140:143], v[78:79], v[114:115], a[140:143] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[140:143], v[80:81], v[116:117], a[140:143] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[144:147], v[74:75], v[114:115], a[144:147] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[144:147], v[76:77], v[116:117], a[144:147] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[148:151], v[70:71], v[90:91], a[148:151] + ; sched_barrier mask(0x000007F6) + 
v_mfma_f32_16x16x16_f16 a[148:151], v[72:73], v[92:93], a[148:151] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[152:155], v[66:67], v[90:91], a[152:155] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[152:155], v[68:69], v[92:93], a[152:155] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[156:159], v[70:71], v[114:115], a[156:159] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[156:159], v[72:73], v[116:117], a[156:159] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[160:163], v[66:67], v[114:115], a[160:163] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[160:163], v[68:69], v[116:117], a[160:163] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16_f16 a[164:167], v[78:79], v[176:177], a[164:167] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[164:167], v[80:81], v[178:179], a[164:167] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[168:171], v[74:75], v[176:177], a[168:171] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[168:171], v[76:77], v[178:179], a[168:171] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[172:175], v[78:79], v[180:181], a[172:175] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[172:175], v[80:81], v[182:183], a[172:175] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[176:179], v[74:75], v[180:181], a[176:179] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[176:179], v[76:77], v[182:183], a[176:179] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[128:131], v[70:71], v[176:177], a[128:131] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[128:131], v[72:73], v[178:179], a[128:131] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[180:183], v[66:67], v[176:177], a[180:183] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[180:183], v[68:69], 
v[178:179], a[180:183] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[184:187], v[70:71], v[180:181], a[184:187] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[184:187], v[72:73], v[182:183], a[184:187] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[188:191], v[66:67], v[180:181], a[188:191] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[188:191], v[68:69], v[182:183], a[188:191] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000406) + v_mfma_f32_16x16x16_f16 a[220:223], v[106:107], v[90:91], a[220:223] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[220:223], v[108:109], v[92:93], a[220:223] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[224:227], v[110:111], v[90:91], a[224:227] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[224:227], v[112:113], v[92:93], a[224:227] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[232:235], v[106:107], v[114:115], a[232:235] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[232:235], v[108:109], v[116:117], a[232:235] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[236:239], v[110:111], v[114:115], a[236:239] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[236:239], v[112:113], v[116:117], a[236:239] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000406) + ds_read_b128 v[94:97], v172 + v_mfma_f32_16x16x16_f16 a[240:243], v[98:99], v[90:91], a[240:243] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[240:243], v[100:101], v[92:93], a[240:243] + ds_read_b128 v[118:121], v173 offset:4096 + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[244:247], v[102:103], v[90:91], a[244:247] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[244:247], v[104:105], v[92:93], a[244:247] + ds_read_b128 v[78:81], v174 offset:32768 + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[248:251], v[98:99], 
v[114:115], a[248:251] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[248:251], v[100:101], v[116:117], a[248:251] + ds_read_b128 v[82:85], v175 offset:4096 + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[252:255], v[102:103], v[114:115], a[252:255] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[252:255], v[104:105], v[116:117], a[252:255] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000406) + ds_read_b128 v[122:125], v173 offset:12288 + ds_read_b128 v[126:129], v173 offset:8192 + ds_read_b128 v[66:69], v175 offset:12288 + ds_read_b128 v[74:77], v175 offset:8192 + v_mfma_f32_16x16x16_f16 a[192:195], v[106:107], v[176:177], a[192:195] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[192:195], v[108:109], v[178:179], a[192:195] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[204:207], v[110:111], v[176:177], a[204:207] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[204:207], v[112:113], v[178:179], a[204:207] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[216:219], v[106:107], v[180:181], a[216:219] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[216:219], v[108:109], v[182:183], a[216:219] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[228:231], v[110:111], v[180:181], a[228:231] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[228:231], v[112:113], v[182:183], a[228:231] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000406) + v_mfma_f32_16x16x16_f16 a[196:199], v[98:99], v[176:177], a[196:199] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[196:199], v[100:101], v[178:179], a[196:199] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[200:203], v[102:103], v[176:177], a[200:203] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[200:203], v[104:105], v[178:179], a[200:203] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[208:211], 
v[98:99], v[180:181], a[208:211] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[208:211], v[100:101], v[182:183], a[208:211] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[212:215], v[102:103], v[180:181], a[212:215] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[212:215], v[104:105], v[182:183], a[212:215] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000406) + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[94:95], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[80:81], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16_f16 a[4:7], v[82:83], v[94:95], a[4:7] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[4:7], v[84:85], v[96:97], a[4:7] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[8:11], v[78:79], v[118:119], a[8:11] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[8:11], v[80:81], v[120:121], a[8:11] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[12:15], v[82:83], v[118:119], a[12:15] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[12:15], v[84:85], v[120:121], a[12:15] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000406) + ds_read_b128 v[70:73], v175 offset:16384 + ds_read_b128 v[106:109], v175 offset:20480 + ds_read_b128 v[102:105], v175 offset:24576 + ds_read_b128 v[98:101], v175 offset:28672 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16_f16 a[16:19], v[74:75], v[94:95], a[16:19] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[16:19], v[76:77], v[96:97], a[16:19] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[20:23], v[66:67], v[94:95], a[20:23] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[20:23], v[68:69], v[96:97], a[20:23] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[74:75], v[118:119], a[24:27] + ; sched_barrier mask(0x000007F6) + 
v_mfma_f32_16x16x16_f16 a[24:27], v[76:77], v[120:121], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[28:31], v[66:67], v[118:119], a[28:31] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[28:31], v[68:69], v[120:121], a[28:31] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000406) + ds_read_b128 v[90:93], v173 offset:16384 + ds_read_b128 v[86:89], v173 offset:20480 + ds_read_b128 v[114:117], v173 offset:24576 + ds_read_b128 v[110:113], v173 offset:28672 + v_mfma_f32_16x16x16_f16 a[32:35], v[78:79], v[126:127], a[32:35] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(0) + s_barrier + v_mfma_f32_16x16x16_f16 a[32:35], v[80:81], v[128:129], a[32:35] + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(8) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(8) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; 
sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(8) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + v_mfma_f32_16x16x16_f16 a[36:39], v[82:83], v[126:127], a[36:39] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[36:39], v[84:85], v[128:129], a[36:39] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[78:79], v[122:123], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[80:81], v[124:125], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[44:47], v[82:83], v[122:123], a[44:47] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[44:47], v[84:85], v[124:125], a[44:47] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000406) + v_mfma_f32_16x16x16_f16 a[48:51], v[74:75], v[126:127], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[48:51], v[76:77], v[128:129], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[52:55], v[66:67], v[126:127], a[52:55] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[52:55], v[68:69], v[128:129], a[52:55] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[74:75], v[122:123], a[56:59] + ; sched_barrier mask(0x000007F6) + 
v_mfma_f32_16x16x16_f16 a[56:59], v[76:77], v[124:125], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[60:63], v[66:67], v[122:123], a[60:63] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[60:63], v[68:69], v[124:125], a[60:63] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000406) + s_waitcnt vmcnt(12) + ds_write_b128 v132, v[58:61] offset:12288 + buffer_load_dwordx4 v[58:61], v158, s[8:11], 0 offen + ds_write_b128 v132, v[46:49] offset:8192 + buffer_load_dwordx4 v[46:49], v157, s[8:11], 0 offen + v_mfma_f32_16x16x16_f16 a[64:67], v[70:71], v[94:95], a[64:67] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[64:67], v[72:73], v[96:97], a[64:67] + s_waitcnt vmcnt(12) + ds_write_b128 v132, v[62:65] offset:20480 + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[68:71], v[106:107], v[94:95], a[68:71] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[68:71], v[108:109], v[96:97], a[68:71] + buffer_load_dwordx4 v[62:65], v160, s[8:11], 0 offen + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[72:75], v[70:71], v[118:119], a[72:75] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[72:75], v[72:73], v[120:121], a[72:75] + ds_write_b128 v132, v[38:41] offset:16384 + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[76:79], v[106:107], v[118:119], a[76:79] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[76:79], v[108:109], v[120:121], a[76:79] + buffer_load_dwordx4 v[38:41], v159, s[8:11], 0 offen + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[80:83], v[102:103], v[94:95], a[80:83] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[80:83], v[104:105], v[96:97], a[80:83] + s_waitcnt vmcnt(12) + ds_write_b128 v132, v[26:29] offset:28672 + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[84:87], v[98:99], v[94:95], a[84:87] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[84:87], 
v[100:101], v[96:97], a[84:87] + buffer_load_dwordx4 v[26:29], v162, s[8:11], 0 offen + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[88:91], v[102:103], v[118:119], a[88:91] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[88:91], v[104:105], v[120:121], a[88:91] + ds_write_b128 v132, v[34:37] offset:24576 + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[92:95], v[98:99], v[118:119], a[92:95] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[92:95], v[100:101], v[120:121], a[92:95] + buffer_load_dwordx4 v[34:37], v161, s[8:11], 0 offen + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[96:99], v[70:71], v[126:127], a[96:99] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[96:99], v[72:73], v[128:129], a[96:99] + ds_write_b128 v132, v[54:57] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[100:103], v[106:107], v[126:127], a[100:103] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[100:103], v[108:109], v[128:129], a[100:103] + buffer_load_dwordx4 v[54:57], v155, s[8:11], 0 offen + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[104:107], v[70:71], v[122:123], a[104:107] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[104:107], v[72:73], v[124:125], a[104:107] + ds_write_b128 v132, v[50:53] offset:4096 + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[108:111], v[106:107], v[122:123], a[108:111] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[108:111], v[108:109], v[124:125], a[108:111] + ; sched_barrier mask(0x000007F6) + buffer_load_dwordx4 v[50:53], v156, s[8:11], 0 offen + s_and_b32 s9, s5, 0xffff + s_mov_b32 s8, s4 + v_mfma_f32_16x16x16_f16 a[112:115], v[102:103], v[126:127], a[112:115] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[112:115], v[104:105], v[128:129], a[112:115] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[116:119], v[98:99], v[126:127], a[116:119] 
+ ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[116:119], v[100:101], v[128:129], a[116:119] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[120:123], v[102:103], v[122:123], a[120:123] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[120:123], v[104:105], v[124:125], a[120:123] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[124:127], v[98:99], v[122:123], a[124:127] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[124:127], v[100:101], v[124:125], a[124:127] + ; sched_barrier mask(0x00000406) + s_waitcnt vmcnt(15) + ds_write_b128 v132, v[30:33] offset:32768 + buffer_load_dwordx4 v[30:33], v163, s[8:11], 0 offen + s_waitcnt vmcnt(15) + ds_write_b128 v131, v[22:25] offset:4096 + buffer_load_dwordx4 v[22:25], v164, s[8:11], 0 offen + s_waitcnt vmcnt(15) + ds_write_b128 v131, v[18:21] offset:8192 + buffer_load_dwordx4 v[18:21], v165, s[8:11], 0 offen + s_waitcnt vmcnt(15) + ds_write_b128 v131, v[14:17] offset:12288 + buffer_load_dwordx4 v[14:17], v166, s[8:11], 0 offen + s_waitcnt vmcnt(15) + ds_write_b128 v131, v[10:13] offset:16384 + buffer_load_dwordx4 v[10:13], v167, s[8:11], 0 offen + s_waitcnt vmcnt(15) + ds_write_b128 v131, v[6:9] offset:20480 + buffer_load_dwordx4 v[6:9], v168, s[8:11], 0 offen + s_waitcnt vmcnt(15) + ds_write_b128 v131, v[2:5] offset:24576 + buffer_load_dwordx4 v[2:5], v169, s[8:11], 0 offen + s_waitcnt vmcnt(15) + ds_write_b128 v131, v[42:45] offset:28672 + buffer_load_dwordx4 v[42:45], v170, s[8:11], 0 offen + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[132:135], v[78:79], v[90:91], a[132:135] + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier 
mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; 
sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) + v_mfma_f32_16x16x16_f16 a[132:135], v[80:81], v[92:93], a[132:135] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[136:139], v[82:83], v[90:91], a[136:139] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[136:139], v[84:85], v[92:93], a[136:139] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[140:143], v[78:79], v[86:87], a[140:143] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[140:143], v[80:81], v[88:89], a[140:143] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[144:147], v[82:83], v[86:87], 
a[144:147] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[144:147], v[84:85], v[88:89], a[144:147] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[148:151], v[74:75], v[90:91], a[148:151] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[148:151], v[76:77], v[92:93], a[148:151] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[152:155], v[66:67], v[90:91], a[152:155] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[152:155], v[68:69], v[92:93], a[152:155] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[156:159], v[74:75], v[86:87], a[156:159] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[156:159], v[76:77], v[88:89], a[156:159] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[160:163], v[66:67], v[86:87], a[160:163] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[160:163], v[68:69], v[88:89], a[160:163] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[164:167], v[78:79], v[114:115], a[164:167] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[164:167], v[80:81], v[116:117], a[164:167] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[168:171], v[82:83], v[114:115], a[168:171] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[168:171], v[84:85], v[116:117], a[168:171] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[172:175], v[78:79], v[110:111], a[172:175] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[172:175], v[80:81], v[112:113], a[172:175] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[176:179], v[82:83], v[110:111], a[176:179] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[176:179], v[84:85], v[112:113], a[176:179] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[128:131], v[74:75], v[114:115], a[128:131] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[128:131], v[76:77], v[116:117], 
a[128:131] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[180:183], v[66:67], v[114:115], a[180:183] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[180:183], v[68:69], v[116:117], a[180:183] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[184:187], v[74:75], v[110:111], a[184:187] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[184:187], v[76:77], v[112:113], a[184:187] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[188:191], v[66:67], v[110:111], a[188:191] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[188:191], v[68:69], v[112:113], a[188:191] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000000) + v_mfma_f32_16x16x16_f16 a[220:223], v[70:71], v[90:91], a[220:223] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[220:223], v[72:73], v[92:93], a[220:223] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[224:227], v[106:107], v[90:91], a[224:227] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[224:227], v[108:109], v[92:93], a[224:227] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[232:235], v[70:71], v[86:87], a[232:235] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[232:235], v[72:73], v[88:89], a[232:235] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[236:239], v[106:107], v[86:87], a[236:239] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[236:239], v[108:109], v[88:89], a[236:239] + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(8) SyncID(0) + ; sched_barrier mask(0x00000000) + v_mfma_f32_16x16x16_f16 a[240:243], v[102:103], v[90:91], a[240:243] + s_waitcnt lgkmcnt(0) + ; sched_barrier mask(0x000007F6) + s_barrier + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[240:243], v[104:105], v[92:93], a[240:243] + ds_read_b128 v[94:97], v1 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) 
SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + v_mfma_f32_16x16x16_f16 a[244:247], v[98:99], v[90:91], a[244:247] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[244:247], v[100:101], v[92:93], a[244:247] + ds_read_b128 v[90:93], v1 offset:4096 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + v_mfma_f32_16x16x16_f16 a[248:251], v[102:103], v[86:87], a[248:251] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[248:251], v[104:105], v[88:89], a[248:251] + ds_read_b128 v[78:81], v133 offset:32768 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + v_mfma_f32_16x16x16_f16 a[252:255], v[98:99], v[86:87], a[252:255] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[252:255], v[100:101], v[88:89], a[252:255] + ds_read_b128 v[74:77], v0 offset:4096 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_barrier mask(0x00000000) + v_mfma_f32_16x16x16_f16 a[192:195], v[70:71], v[114:115], a[192:195] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[192:195], v[72:73], v[116:117], a[192:195] + ds_read_b128 v[86:89], v1 offset:8192 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + v_mfma_f32_16x16x16_f16 a[204:207], v[106:107], v[114:115], a[204:207] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[204:207], v[108:109], v[116:117], a[204:207] + ds_read_b128 v[82:85], v1 offset:12288 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + v_mfma_f32_16x16x16_f16 a[216:219], 
v[70:71], v[110:111], a[216:219] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[216:219], v[72:73], v[112:113], a[216:219] + ds_read_b128 v[70:73], v0 offset:8192 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + v_mfma_f32_16x16x16_f16 a[228:231], v[106:107], v[110:111], a[228:231] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[228:231], v[108:109], v[112:113], a[228:231] + ds_read_b128 v[66:69], v0 offset:12288 + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(2) SyncID(0) + ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) + ; sched_barrier mask(0x00000000) + v_mfma_f32_16x16x16_f16 a[196:199], v[102:103], v[114:115], a[196:199] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[196:199], v[104:105], v[116:117], a[196:199] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[200:203], v[98:99], v[114:115], a[200:203] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[200:203], v[100:101], v[116:117], a[200:203] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[208:211], v[102:103], v[110:111], a[208:211] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[208:211], v[104:105], v[112:113], a[208:211] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[212:215], v[98:99], v[110:111], a[212:215] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[212:215], v[100:101], v[112:113], a[212:215] + ; sched_barrier mask(0x000007F6) + ; sched_group_barrier mask(0x00000008) size(8) SyncID(0) + ; sched_barrier mask(0x00000000) + s_add_u32 s4, s4, 0x80 + s_addc_u32 s5, s5, 0 + s_add_u32 s2, s2, 0x80 + s_addc_u32 s3, s3, 0 + s_add_i32 s13, s13, -1 + s_cmp_lg_u32 s13, 0 + s_cbranch_scc1 .LBB0_5 +.LBB0_6: ; %Flow430 + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[94:95], a[0:3] + ; sched_barrier 
mask(0x000007F6) + s_load_dword s4, s[0:1], 0x34 + v_and_b32_e32 v108, 28, v152 + v_or_b32_e32 v251, 0xe0, v108 + v_mfma_f32_16x16x16_f16 a[0:3], v[80:81], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + v_or_b32_e32 v106, 0x60, v108 + v_or_b32_e32 v107, 64, v108 + v_or_b32_e32 v109, 32, v108 + s_nop 3 + v_accvgpr_read_b32 v105, a3 + v_accvgpr_read_b32 v104, a2 + v_accvgpr_read_b32 v103, a1 + v_accvgpr_read_b32 v102, a0 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[94:95], a[4:7] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[76:77], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v163, a3 + v_accvgpr_read_b32 v162, a2 + v_accvgpr_read_b32 v161, a1 + v_accvgpr_read_b32 v160, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[90:91], a[8:11] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[80:81], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v171, a3 + v_accvgpr_read_b32 v170, a2 + v_accvgpr_read_b32 v169, a1 + v_accvgpr_read_b32 v168, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[90:91], a[12:15] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[76:77], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v183, a3 + v_accvgpr_read_b32 v182, a2 + v_accvgpr_read_b32 v181, a1 + v_accvgpr_read_b32 v180, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[94:95], a[16:19] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[72:73], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v159, a3 + v_accvgpr_read_b32 v158, a2 + v_accvgpr_read_b32 v157, a1 + v_accvgpr_read_b32 v156, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[94:95], a[20:23] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[68:69], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v167, a3 + 
v_accvgpr_read_b32 v166, a2 + v_accvgpr_read_b32 v165, a1 + v_accvgpr_read_b32 v164, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[90:91], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[72:73], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v175, a3 + v_accvgpr_read_b32 v174, a2 + v_accvgpr_read_b32 v173, a1 + v_accvgpr_read_b32 v172, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[90:91], a[28:31] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[68:69], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v191, a3 + v_accvgpr_read_b32 v190, a2 + v_accvgpr_read_b32 v189, a1 + v_accvgpr_read_b32 v188, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[86:87], a[32:35] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[28:31], v[80:81], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[86:87], a[36:39] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[76:77], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v179, a3 + v_accvgpr_read_b32 v178, a2 + v_accvgpr_read_b32 v177, a1 + v_accvgpr_read_b32 v176, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[82:83], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[80:81], v[84:85], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v187, a3 + v_accvgpr_read_b32 v186, a2 + v_accvgpr_read_b32 v185, a1 + v_accvgpr_read_b32 v184, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[82:83], a[44:47] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[76:77], v[84:85], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v195, a3 + v_accvgpr_read_b32 v194, a2 + v_accvgpr_read_b32 v193, a1 + v_accvgpr_read_b32 v192, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[86:87], a[48:51] + ; sched_barrier 
mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[72:73], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v199, a3 + v_accvgpr_read_b32 v198, a2 + v_accvgpr_read_b32 v197, a1 + v_accvgpr_read_b32 v196, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[86:87], a[52:55] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[68:69], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v203, a3 + v_accvgpr_read_b32 v202, a2 + v_accvgpr_read_b32 v201, a1 + v_accvgpr_read_b32 v200, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[82:83], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[72:73], v[84:85], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v207, a3 + v_accvgpr_read_b32 v206, a2 + v_accvgpr_read_b32 v205, a1 + v_accvgpr_read_b32 v204, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[82:83], a[60:63] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[68:69], v[84:85], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v211, a3 + v_accvgpr_read_b32 v210, a2 + v_accvgpr_read_b32 v209, a1 + v_accvgpr_read_b32 v208, a0 + ; sched_barrier mask(0x00000000) + v_lshlrev_b32_e32 v111, 1, v154 + v_add_u32_e32 v110, 0, v111 + ds_read_b128 v[98:101], v110 offset:32768 + v_add_u32_e32 v111, s14, v111 + ds_read_b128 v[114:117], v111 offset:4096 + v_lshlrev_b32_e32 v113, 1, v153 + v_add_u32_e32 v112, 0, v113 + ds_read_b128 v[118:121], v112 offset:32768 + v_add_u32_e32 v113, s14, v113 + ds_read_b128 v[122:125], v113 offset:4096 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[94:95], a[64:67] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[100:101], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v215, a3 + v_accvgpr_read_b32 v214, a2 + v_accvgpr_read_b32 v213, a1 + v_accvgpr_read_b32 v212, a0 + s_waitcnt lgkmcnt(2) + 
v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[94:95], a[68:71] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[20:23], v[116:117], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[90:91], a[72:75] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[100:101], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v219, a3 + v_accvgpr_read_b32 v218, a2 + v_accvgpr_read_b32 v217, a1 + v_accvgpr_read_b32 v216, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[90:91], a[76:79] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[8:11], v[116:117], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16_f16 a[0:3], v[118:119], v[94:95], a[80:83] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[120:121], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v155, a3 + v_accvgpr_read_b32 v154, a2 + v_accvgpr_read_b32 v153, a1 + v_accvgpr_read_b32 v152, a0 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[0:3], v[122:123], v[94:95], a[84:87] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[16:19], v[124:125], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[118:119], v[90:91], a[88:91] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[12:15], v[120:121], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[122:123], v[90:91], a[92:95] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[124:125], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v223, a3 + v_accvgpr_read_b32 v222, a2 + v_accvgpr_read_b32 v221, a1 + v_accvgpr_read_b32 v220, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[86:87], a[96:99] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[52:55], v[100:101], v[88:89], a[0:3] + ; 
sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[86:87], a[100:103] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[116:117], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v227, a3 + v_accvgpr_read_b32 v226, a2 + v_accvgpr_read_b32 v225, a1 + v_accvgpr_read_b32 v224, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[82:83], a[104:107] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[48:51], v[100:101], v[84:85], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[82:83], a[108:111] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[116:117], v[84:85], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v231, a3 + v_accvgpr_read_b32 v230, a2 + v_accvgpr_read_b32 v229, a1 + v_accvgpr_read_b32 v228, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[118:119], v[86:87], a[112:115] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[120:121], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v235, a3 + v_accvgpr_read_b32 v234, a2 + v_accvgpr_read_b32 v233, a1 + v_accvgpr_read_b32 v232, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[122:123], v[86:87], a[116:119] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[124:125], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v239, a3 + v_accvgpr_read_b32 v238, a2 + v_accvgpr_read_b32 v237, a1 + v_accvgpr_read_b32 v236, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[118:119], v[82:83], a[120:123] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[120:121], v[84:85], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v243, a3 + v_accvgpr_read_b32 v242, a2 + v_accvgpr_read_b32 v241, a1 + v_accvgpr_read_b32 v240, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[122:123], v[82:83], a[124:127] + ; sched_barrier mask(0x000007F6) + 
v_mfma_f32_16x16x16_f16 a[0:3], v[124:125], v[84:85], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v247, a3 + v_accvgpr_read_b32 v246, a2 + v_accvgpr_read_b32 v245, a1 + v_accvgpr_read_b32 v244, a0 + ; sched_barrier mask(0x00000000) + v_lshl_add_u32 v82, v151, 1, 0 + ds_read_b128 v[86:89], v82 + v_lshl_add_u32 v83, v149, 1, 0 + ds_read_b128 v[94:97], v83 + v_lshl_add_u32 v84, v150, 1, 0 + ds_read_b128 v[90:93], v84 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[86:87], a[132:135] + ; sched_barrier mask(0x000007F6) + v_lshl_add_u32 v85, v148, 1, 0 + ds_read_b128 v[126:129], v85 + v_mfma_f32_16x16x16_f16 a[132:135], v[80:81], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[86:87], a[136:139] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[112:115], v[76:77], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[90:91], a[140:143] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[92:95], v[80:81], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[90:91], a[144:147] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[96:99], v[76:77], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[86:87], a[148:151] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[100:103], v[72:73], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[86:87], a[152:155] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[104:107], v[68:69], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[90:91], a[156:159] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[72:73], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v255, a3 + 
v_accvgpr_read_b32 v254, a2 + v_accvgpr_read_b32 v253, a1 + v_accvgpr_read_b32 v252, a0 + v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[90:91], a[160:163] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[108:111], v[68:69], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[94:95], a[164:167] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[44:47], v[80:81], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[94:95], a[168:171] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[76:77], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[126:127], a[172:175] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[88:91], v[80:81], v[128:129], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[126:127], a[176:179] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[80:83], v[76:77], v[128:129], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[94:95], a[128:131] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[124:127], v[72:73], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[94:95], a[180:183] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[76:79], v[68:69], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[126:127], a[184:187] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[72:73], v[128:129], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[126:127], a[188:191] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[84:87], v[68:69], v[128:129], a[0:3] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000000) + v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], 
v[86:87], a[220:223] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[220:223], v[100:101], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[86:87], a[224:227] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[120:123], v[116:117], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[90:91], a[232:235] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[224:227], v[100:101], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[90:91], a[236:239] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[232:235], v[116:117], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[118:119], v[86:87], a[240:243] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[72:75], v[120:121], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[122:123], v[86:87], a[244:247] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[68:71], v[124:125], v[88:89], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[118:119], v[90:91], a[248:251] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[64:67], v[120:121], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[122:123], v[90:91], a[252:255] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[60:63], v[124:125], v[92:93], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[94:95], a[192:195] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[36:39], v[100:101], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[94:95], a[204:207] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[32:35], v[116:117], v[96:97], a[0:3] + ; sched_barrier mask(0x000007F6) + 
v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[126:127], a[216:219] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[4:7], v[100:101], v[128:129], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[126:127], a[228:231] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[116:117], v[128:129], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[118:119], v[94:95], a[196:199] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[116:119], v[120:121], v[96:97], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[122:123], v[94:95], a[200:203] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[164:167], v[124:125], v[96:97], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[118:119], v[126:127], a[208:211] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[172:175], v[120:121], v[128:129], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[122:123], v[126:127], a[212:215] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[168:171], v[124:125], v[128:129], a[24:27] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000000) + v_lshlrev_b32_e32 v70, 1, v147 + v_add_u32_e32 v80, 0, v70 + ds_read_b128 v[86:89], v80 offset:32768 + v_lshl_add_u32 v81, v145, 1, 0 + ds_read_b128 v[66:69], v81 + s_nop 0 + v_accvgpr_write_b32 a24, v102 + v_accvgpr_write_b32 a25, v103 + v_accvgpr_write_b32 a26, v104 + v_accvgpr_write_b32 a27, v105 + v_lshl_add_u32 v75, v146, 1, 0 + v_add_u32_e32 v76, s14, v70 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[24:27], v[86:87], v[66:67], a[24:27] + ds_read_b128 v[90:93], v75 + ds_read_b128 v[94:97], v76 offset:4096 + ; sched_barrier mask(0x000007F6) + v_lshlrev_b32_e32 v70, 1, v144 + v_add_u32_e32 v79, 0, v70 + v_mfma_f32_16x16x16_f16 a[204:207], v[88:89], v[68:69], a[24:27] + ; sched_barrier 
mask(0x000007F6) + ds_read_b128 v[98:101], v79 offset:32768 + v_add_u32_e32 v77, s14, v70 + ds_read_b128 v[114:117], v77 offset:4096 + v_accvgpr_write_b32 a24, v160 + v_accvgpr_write_b32 a25, v161 + v_accvgpr_write_b32 a26, v162 + v_accvgpr_write_b32 a27, v163 + v_lshl_add_u32 v78, v142, 1, 0 + ds_read_b128 v[118:121], v78 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16_f16 a[24:27], v[94:95], v[66:67], a[24:27] + ; sched_barrier mask(0x000007F6) + v_lshl_add_u32 v74, v143, 1, 0 + ds_read_b128 v[122:125], v74 + v_mfma_f32_16x16x16_f16 a[208:211], v[96:97], v[68:69], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 3 + v_accvgpr_write_b32 a24, v168 + v_accvgpr_write_b32 a25, v169 + v_accvgpr_write_b32 a26, v170 + v_accvgpr_write_b32 a27, v171 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[24:27], v[86:87], v[90:91], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[212:215], v[88:89], v[92:93], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a24, v180 + v_accvgpr_write_b32 a25, v181 + v_accvgpr_write_b32 a26, v182 + v_accvgpr_write_b32 a27, v183 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[24:27], v[94:95], v[90:91], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[216:219], v[96:97], v[92:93], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a24, v156 + v_accvgpr_write_b32 a25, v157 + v_accvgpr_write_b32 a26, v158 + v_accvgpr_write_b32 a27, v159 + s_waitcnt lgkmcnt(3) + s_nop 0 + v_mfma_f32_16x16x16_f16 a[24:27], v[98:99], v[66:67], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[244:247], v[100:101], v[68:69], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a24, v164 + v_accvgpr_write_b32 a25, v165 + v_accvgpr_write_b32 a26, v166 + v_accvgpr_write_b32 a27, v167 + s_waitcnt lgkmcnt(2) + s_nop 0 + v_mfma_f32_16x16x16_f16 a[24:27], v[114:115], v[66:67], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[228:231], 
v[116:117], v[68:69], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a24, v172 + v_accvgpr_write_b32 a25, v173 + v_accvgpr_write_b32 a26, v174 + v_accvgpr_write_b32 a27, v175 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[24:27], v[98:99], v[90:91], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[248:251], v[100:101], v[92:93], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a24, v188 + v_accvgpr_write_b32 a25, v189 + v_accvgpr_write_b32 a26, v190 + v_accvgpr_write_b32 a27, v191 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[24:27], v[114:115], v[90:91], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[236:239], v[116:117], v[92:93], a[24:27] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16_f16 a[24:27], v[86:87], v[118:119], a[28:31] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[88:89], v[120:121], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v105, a27 + v_accvgpr_read_b32 v104, a26 + v_accvgpr_read_b32 v103, a25 + v_accvgpr_read_b32 v102, a24 + v_accvgpr_write_b32 a24, v176 + v_accvgpr_write_b32 a25, v177 + v_accvgpr_write_b32 a26, v178 + v_accvgpr_write_b32 a27, v179 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[24:27], v[94:95], v[118:119], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[96:97], v[120:121], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v159, a27 + v_accvgpr_read_b32 v158, a26 + v_accvgpr_read_b32 v157, a25 + v_accvgpr_read_b32 v156, a24 + v_accvgpr_write_b32 a24, v184 + v_accvgpr_write_b32 a25, v185 + v_accvgpr_write_b32 a26, v186 + v_accvgpr_write_b32 a27, v187 + s_waitcnt lgkmcnt(0) + s_nop 0 + v_mfma_f32_16x16x16_f16 a[24:27], v[86:87], v[122:123], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[88:89], v[124:125], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 6 + 
v_accvgpr_read_b32 v163, a27 + v_accvgpr_read_b32 v162, a26 + v_accvgpr_read_b32 v161, a25 + v_accvgpr_read_b32 v160, a24 + v_accvgpr_write_b32 a24, v192 + v_accvgpr_write_b32 a25, v193 + v_accvgpr_write_b32 a26, v194 + v_accvgpr_write_b32 a27, v195 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[24:27], v[94:95], v[122:123], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[96:97], v[124:125], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v167, a27 + v_accvgpr_read_b32 v166, a26 + v_accvgpr_read_b32 v165, a25 + v_accvgpr_read_b32 v164, a24 + v_accvgpr_write_b32 a24, v196 + v_accvgpr_write_b32 a25, v197 + v_accvgpr_write_b32 a26, v198 + v_accvgpr_write_b32 a27, v199 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[24:27], v[98:99], v[118:119], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[100:101], v[120:121], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v171, a27 + v_accvgpr_read_b32 v170, a26 + v_accvgpr_read_b32 v169, a25 + v_accvgpr_read_b32 v168, a24 + v_accvgpr_write_b32 a24, v200 + v_accvgpr_write_b32 a25, v201 + v_accvgpr_write_b32 a26, v202 + v_accvgpr_write_b32 a27, v203 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[24:27], v[114:115], v[118:119], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[116:117], v[120:121], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v175, a27 + v_accvgpr_read_b32 v174, a26 + v_accvgpr_read_b32 v173, a25 + v_accvgpr_read_b32 v172, a24 + v_accvgpr_write_b32 a24, v204 + v_accvgpr_write_b32 a25, v205 + v_accvgpr_write_b32 a26, v206 + v_accvgpr_write_b32 a27, v207 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[24:27], v[98:99], v[122:123], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[100:101], v[124:125], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v179, a27 + v_accvgpr_read_b32 v178, a26 + v_accvgpr_read_b32 v177, 
a25 + v_accvgpr_read_b32 v176, a24 + v_accvgpr_write_b32 a24, v208 + v_accvgpr_write_b32 a25, v209 + v_accvgpr_write_b32 a26, v210 + v_accvgpr_write_b32 a27, v211 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[24:27], v[114:115], v[122:123], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[116:117], v[124:125], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v183, a27 + v_accvgpr_read_b32 v182, a26 + v_accvgpr_read_b32 v181, a25 + v_accvgpr_read_b32 v180, a24 + ; sched_barrier mask(0x00000000) + v_lshl_add_u32 v73, v140, 1, 0 + ds_read_b128 v[126:129], v73 offset:32768 + v_lshl_add_u32 v71, v141, 1, 0 + ds_read_b128 v[140:143], v71 offset:32768 + v_accvgpr_write_b32 a24, v212 + v_accvgpr_write_b32 a25, v213 + v_accvgpr_write_b32 a26, v214 + v_accvgpr_write_b32 a27, v215 + v_lshl_add_u32 v72, v138, 1, 0 + ds_read_b128 v[144:147], v72 offset:32768 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16_f16 a[24:27], v[126:127], v[66:67], a[24:27] + v_lshl_add_u32 v70, v139, 1, 0 + ds_read_b128 v[148:151], v70 offset:32768 + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[188:191], v[128:129], v[68:69], a[24:27] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16_f16 a[20:23], v[140:141], v[66:67], a[20:23] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[142:143], v[68:69], a[20:23] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a20, v216 + v_accvgpr_write_b32 a21, v217 + v_accvgpr_write_b32 a22, v218 + v_accvgpr_write_b32 a23, v219 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[20:23], v[126:127], v[90:91], a[20:23] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[28:31], v[128:129], v[92:93], a[20:23] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[8:11], v[140:141], v[90:91], a[8:11] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[20:23], v[142:143], v[92:93], a[8:11] + ; sched_barrier mask(0x000007F6) + 
s_nop 5 + v_accvgpr_write_b32 a8, v152 + v_accvgpr_write_b32 a9, v153 + v_accvgpr_write_b32 a10, v154 + v_accvgpr_write_b32 a11, v155 + s_waitcnt lgkmcnt(1) + s_nop 0 + v_mfma_f32_16x16x16_f16 a[8:11], v[144:145], v[66:67], a[8:11] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[160:163], v[146:147], v[68:69], a[8:11] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[8:11], v[148:149], v[66:67], a[16:19] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[8:11], v[150:151], v[68:69], a[8:11] + ; sched_barrier mask(0x000007F6) + s_nop 1 + v_accvgpr_write_b32 a16, v220 + v_accvgpr_write_b32 a17, v221 + v_accvgpr_write_b32 a18, v222 + v_mfma_f32_16x16x16_f16 a[12:15], v[144:145], v[90:91], a[12:15] + v_accvgpr_write_b32 a19, v223 + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[12:15], v[146:147], v[92:93], a[12:15] + ; sched_barrier mask(0x000007F6) + s_nop 0 + v_mfma_f32_16x16x16_f16 a[16:19], v[148:149], v[90:91], a[16:19] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[16:19], v[150:151], v[92:93], a[16:19] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[52:55], v[126:127], v[118:119], a[52:55] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[184:187], v[128:129], v[120:121], a[52:55] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a52, v224 + v_accvgpr_write_b32 a53, v225 + v_accvgpr_write_b32 a54, v226 + v_accvgpr_write_b32 a55, v227 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[52:55], v[140:141], v[118:119], a[52:55] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[140:143], v[142:143], v[120:121], a[52:55] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[48:51], v[126:127], v[122:123], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[144:147], v[128:129], v[124:125], a[48:51] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a48, v228 + v_accvgpr_write_b32 a49, 
v229 + v_accvgpr_write_b32 a50, v230 + v_accvgpr_write_b32 a51, v231 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[48:51], v[140:141], v[122:123], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[156:159], v[142:143], v[124:125], a[48:51] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a48, v232 + v_accvgpr_write_b32 a49, v233 + v_accvgpr_write_b32 a50, v234 + v_accvgpr_write_b32 a51, v235 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[48:51], v[144:145], v[118:119], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[176:179], v[146:147], v[120:121], a[48:51] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a48, v236 + v_accvgpr_write_b32 a49, v237 + v_accvgpr_write_b32 a50, v238 + v_accvgpr_write_b32 a51, v239 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[48:51], v[148:149], v[118:119], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[180:183], v[150:151], v[120:121], a[48:51] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a48, v240 + v_accvgpr_write_b32 a49, v241 + v_accvgpr_write_b32 a50, v242 + v_accvgpr_write_b32 a51, v243 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[48:51], v[144:145], v[122:123], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[148:151], v[146:147], v[124:125], a[48:51] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a48, v244 + v_accvgpr_write_b32 a49, v245 + v_accvgpr_write_b32 a50, v246 + v_accvgpr_write_b32 a51, v247 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[48:51], v[148:149], v[122:123], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[152:155], v[150:151], v[124:125], a[48:51] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000000) + v_lshl_add_u32 v69, v136, 1, 0 + ds_read_b128 v[90:93], v69 + v_lshl_add_u32 v68, v137, 1, 0 + ds_read_b128 v[118:121], v68 + v_lshl_add_u32 v67, v134, 1, 0 + ds_read_b128 v[122:125], v67 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16_f16 
a[48:51], v[86:87], v[90:91], a[132:135] + ; sched_barrier mask(0x000007F6) + v_lshl_add_u32 v66, v135, 1, 0 + ds_read_b128 v[134:137], v66 + v_mfma_f32_16x16x16_f16 a[136:139], v[88:89], v[92:93], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[48:51], v[94:95], v[90:91], a[112:115] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[112:115], v[96:97], v[92:93], a[48:51] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16_f16 a[48:51], v[86:87], v[118:119], a[92:95] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[128:131], v[88:89], v[120:121], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[48:51], v[94:95], v[118:119], a[96:99] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[132:135], v[96:97], v[120:121], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[48:51], v[98:99], v[90:91], a[100:103] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[100:103], v[100:101], v[92:93], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[48:51], v[114:115], v[90:91], a[104:107] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[104:107], v[116:117], v[92:93], a[48:51] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a48, v252 + v_accvgpr_write_b32 a49, v253 + v_accvgpr_write_b32 a50, v254 + v_accvgpr_write_b32 a51, v255 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[48:51], v[98:99], v[118:119], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[252:255], v[100:101], v[120:121], a[48:51] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[48:51], v[114:115], v[118:119], a[108:111] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[108:111], v[116:117], v[120:121], a[48:51] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16_f16 a[44:47], v[86:87], v[122:123], a[44:47] + ; sched_barrier mask(0x000007F6) + 
v_mfma_f32_16x16x16_f16 a[44:47], v[88:89], v[124:125], a[44:47] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[94:95], v[122:123], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[96:97], v[124:125], a[40:43] + ; sched_barrier mask(0x000007F6) + s_nop 4 + v_accvgpr_read_b32 v155, a47 + v_accvgpr_read_b32 v154, a46 + v_accvgpr_read_b32 v153, a45 + v_accvgpr_read_b32 v152, a44 + v_accvgpr_read_b32 v187, a43 + v_accvgpr_read_b32 v186, a42 + v_accvgpr_read_b32 v185, a41 + v_accvgpr_read_b32 v184, a40 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[40:43], v[86:87], v[134:135], a[88:91] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[88:89], v[136:137], a[40:43] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v89, a43 + v_accvgpr_read_b32 v88, a42 + v_accvgpr_read_b32 v87, a41 + v_accvgpr_read_b32 v86, a40 + v_mfma_f32_16x16x16_f16 a[40:43], v[94:95], v[134:135], a[80:83] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[240:243], v[96:97], v[136:137], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[98:99], v[122:123], a[124:127] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[100:101], v[124:125], a[40:43] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v97, a43 + v_accvgpr_read_b32 v96, a42 + v_accvgpr_read_b32 v95, a41 + v_accvgpr_read_b32 v94, a40 + v_mfma_f32_16x16x16_f16 a[40:43], v[114:115], v[122:123], a[76:79] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[192:195], v[116:117], v[124:125], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[98:99], v[134:135], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[196:199], v[100:101], v[136:137], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[114:115], v[134:135], a[84:87] + ; sched_barrier mask(0x000007F6) + 
v_mfma_f32_16x16x16_f16 a[200:203], v[116:117], v[136:137], a[40:43] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000000) + v_mfma_f32_16x16x16_f16 a[40:43], v[126:127], v[90:91], a[220:223] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[80:83], v[128:129], v[92:93], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[140:141], v[90:91], a[120:123] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[84:87], v[142:143], v[92:93], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[126:127], v[118:119], a[224:227] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[128:129], v[120:121], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[140:141], v[118:119], a[232:235] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[76:79], v[142:143], v[120:121], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[144:145], v[90:91], a[72:75] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[72:75], v[146:147], v[92:93], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[148:149], v[90:91], a[68:71] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[68:71], v[150:151], v[92:93], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[144:145], v[118:119], a[64:67] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[64:67], v[146:147], v[120:121], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[148:149], v[118:119], a[60:63] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[60:63], v[150:151], v[120:121], a[40:43] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[36:39], v[126:127], v[122:123], a[36:39] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[96:99], v[128:129], v[124:125], a[36:39] + ; sched_barrier 
mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[32:35], v[140:141], v[122:123], a[32:35] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[92:95], v[142:143], v[124:125], a[32:35] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[4:7], v[126:127], v[134:135], a[4:7] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[220:223], v[128:129], v[136:137], a[4:7] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[140:141], v[134:135], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[224:227], v[142:143], v[136:137], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[144:145], v[122:123], a[116:119] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[232:235], v[146:147], v[124:125], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[148:149], v[122:123], a[164:167] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[116:119], v[150:151], v[124:125], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[144:145], v[134:135], a[172:175] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[146:147], v[136:137], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[4:7], v[148:149], v[134:135], a[168:171] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[124:127], v[150:151], v[136:137], a[4:7] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000000) + s_barrier + s_waitcnt vmcnt(9) + ds_write_b128 v132, v[54:57] + s_waitcnt vmcnt(8) + ds_write_b128 v132, v[50:53] offset:4096 + ds_write_b128 v132, v[46:49] offset:8192 + ds_write_b128 v132, v[58:61] offset:12288 + ds_write_b128 v132, v[38:41] offset:16384 + ds_write_b128 v132, v[62:65] offset:20480 + ds_write_b128 v132, v[34:37] offset:24576 + ds_write_b128 v132, v[26:29] offset:28672 + s_waitcnt vmcnt(7) + ds_write_b128 v132, v[30:33] offset:32768 + s_waitcnt vmcnt(6) + ds_write_b128 v131, 
v[22:25] offset:4096 + s_waitcnt vmcnt(5) + ds_write_b128 v131, v[18:21] offset:8192 + s_waitcnt vmcnt(4) + ds_write_b128 v131, v[14:17] offset:12288 + s_waitcnt vmcnt(3) + ds_write_b128 v131, v[10:13] offset:16384 + s_waitcnt vmcnt(2) + ds_write_b128 v131, v[6:9] offset:20480 + s_waitcnt vmcnt(1) + ds_write_b128 v131, v[2:5] offset:24576 + s_waitcnt vmcnt(0) + ds_write_b128 v131, v[42:45] offset:28672 + ; sched_barrier mask(0x00000000) + s_waitcnt lgkmcnt(0) + s_barrier + ds_read_b128 v[2:5], v133 offset:32768 + ds_read_b128 v[6:9], v1 + ds_read_b128 v[10:13], v0 offset:4096 + ds_read_b128 v[14:17], v1 offset:4096 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16_f16 a[4:7], v[2:3], v[6:7], a[204:207] + ; sched_barrier mask(0x000007F6) + ds_read_b128 v[18:21], v0 offset:8192 + ; sched_barrier mask(0x000007F6) + ds_read_b128 v[22:25], v0 offset:12288 + ds_read_b128 v[26:29], v110 offset:32768 + ; sched_barrier mask(0x000007F6) + ds_read_b128 v[30:33], v111 offset:4096 + ds_read_b128 v[34:37], v112 offset:32768 + ; sched_barrier mask(0x000007F6) + ds_read_b128 v[38:41], v113 offset:4096 + v_mfma_f32_16x16x16_f16 a[36:39], v[4:5], v[8:9], a[4:7] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(7) + v_mfma_f32_16x16x16_f16 a[4:7], v[10:11], v[6:7], a[208:211] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[120:123], v[12:13], v[8:9], a[4:7] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x16_f16 a[4:7], v[2:3], v[14:15], a[212:215] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[208:211], v[4:5], v[16:17], a[4:7] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[4:7], v[10:11], v[14:15], a[216:219] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[40:43], v[12:13], v[16:17], a[4:7] + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16_f16 a[4:7], v[18:19], v[6:7], a[244:247] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[44:47], v[20:21], v[8:9], a[4:7] + ; sched_barrier 
mask(0x000007F6) + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16_f16 a[4:7], v[22:23], v[6:7], a[228:231] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[48:51], v[24:25], v[8:9], a[4:7] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[4:7], v[18:19], v[14:15], a[248:251] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[4:7], v[20:21], v[16:17], a[4:7] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[32:35], v[22:23], v[14:15], a[236:239] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[88:91], v[24:25], v[16:17], a[32:35] + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16_f16 a[32:35], v[26:27], v[6:7], a[188:191] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[188:191], v[28:29], v[8:9], a[32:35] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16_f16 a[24:27], v[30:31], v[6:7], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[164:167], v[32:33], v[8:9], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[24:27], v[26:27], v[14:15], a[28:31] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[204:207], v[28:29], v[16:17], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[20:23], v[30:31], v[14:15], a[20:23] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[172:175], v[32:33], v[16:17], a[20:23] + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16_f16 a[20:23], v[34:35], v[6:7], a[160:163] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[168:171], v[36:37], v[8:9], a[20:23] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[8:11], v[38:39], v[6:7], a[8:11] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[160:163], v[40:41], v[8:9], a[8:11] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[8:11], v[34:35], v[14:15], a[12:15] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[244:247], v[36:37], 
v[16:17], a[8:11] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[8:11], v[38:39], v[14:15], a[16:19] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[236:239], v[40:41], v[16:17], a[8:11] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000000) + ds_read_b128 v[6:9], v1 offset:8192 + ds_read_b128 v[14:17], v1 offset:12288 + s_nop 3 + v_accvgpr_write_b32 a8, v102 + v_accvgpr_write_b32 a9, v103 + v_accvgpr_write_b32 a10, v104 + v_accvgpr_write_b32 a11, v105 + v_accvgpr_write_b32 a12, v156 + v_accvgpr_write_b32 a13, v157 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16_f16 a[8:11], v[2:3], v[6:7], a[8:11] + v_accvgpr_write_b32 a14, v158 + v_accvgpr_write_b32 a15, v159 + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a16, v160 + v_mfma_f32_16x16x16_f16 a[8:11], v[4:5], v[8:9], a[8:11] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a17, v161 + v_accvgpr_write_b32 a18, v162 + v_accvgpr_write_b32 a19, v163 + v_mfma_f32_16x16x16_f16 a[12:15], v[10:11], v[6:7], a[12:15] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a20, v164 + v_accvgpr_write_b32 a21, v165 + v_accvgpr_write_b32 a22, v166 + v_mfma_f32_16x16x16_f16 a[12:15], v[12:13], v[8:9], a[12:15] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a23, v167 + v_accvgpr_write_b32 a24, v168 + v_accvgpr_write_b32 a25, v169 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[16:19], v[2:3], v[14:15], a[16:19] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a26, v170 + v_accvgpr_write_b32 a27, v171 + v_accvgpr_write_b32 a28, v172 + v_mfma_f32_16x16x16_f16 a[16:19], v[4:5], v[16:17], a[16:19] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a29, v173 + v_accvgpr_write_b32 a30, v174 + v_accvgpr_write_b32 a31, v175 + v_mfma_f32_16x16x16_f16 a[20:23], v[10:11], v[14:15], a[20:23] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a32, v176 + v_accvgpr_write_b32 a33, v177 + v_accvgpr_write_b32 a34, v178 + v_mfma_f32_16x16x16_f16 
a[20:23], v[12:13], v[16:17], a[20:23] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a35, v179 + v_accvgpr_write_b32 a52, v180 + v_accvgpr_write_b32 a53, v181 + v_mfma_f32_16x16x16_f16 a[24:27], v[18:19], v[6:7], a[24:27] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a54, v182 + v_accvgpr_write_b32 a55, v183 + v_mfma_f32_16x16x16_f16 a[24:27], v[20:21], v[8:9], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[28:31], v[22:23], v[6:7], a[28:31] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[28:31], v[24:25], v[8:9], a[28:31] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[32:35], v[18:19], v[14:15], a[32:35] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[32:35], v[20:21], v[16:17], a[32:35] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[52:55], v[22:23], v[14:15], a[52:55] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[52:55], v[24:25], v[16:17], a[52:55] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[184:187], v[26:27], v[6:7], a[184:187] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[228:231], v[28:29], v[8:9], a[184:187] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[140:143], v[30:31], v[6:7], a[140:143] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[140:143], v[32:33], v[8:9], a[140:143] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[144:147], v[26:27], v[14:15], a[144:147] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[144:147], v[28:29], v[16:17], a[144:147] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[156:159], v[30:31], v[14:15], a[156:159] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[156:159], v[32:33], v[16:17], a[156:159] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[176:179], v[34:35], v[6:7], a[176:179] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[176:179], v[36:37], 
v[8:9], a[176:179] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[180:183], v[38:39], v[6:7], a[180:183] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[180:183], v[40:41], v[8:9], a[180:183] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[148:151], v[34:35], v[14:15], a[148:151] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[148:151], v[36:37], v[16:17], a[148:151] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[152:155], v[38:39], v[14:15], a[152:155] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[248:251], v[40:41], v[16:17], a[152:155] + ; sched_barrier mask(0x000007F6) + ; sched_barrier mask(0x00000000) + ds_read_b128 v[6:9], v82 + ds_read_b128 v[14:17], v84 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16_f16 a[136:139], v[2:3], v[6:7], a[136:139] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[216:219], v[4:5], v[8:9], a[136:139] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[112:115], v[10:11], v[6:7], a[112:115] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[212:215], v[12:13], v[8:9], a[112:115] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[112:115], v[2:3], v[14:15], a[128:131] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[112:115], v[4:5], v[16:17], a[112:115] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[128:131], v[10:11], v[14:15], a[132:135] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[136:139], v[12:13], v[16:17], a[128:131] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[100:103], v[18:19], v[6:7], a[100:103] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[128:131], v[20:21], v[8:9], a[100:103] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[100:103], v[22:23], v[6:7], a[104:107] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[104:107], v[24:25], v[8:9], a[100:103] 
+ ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[100:103], v[18:19], v[14:15], a[252:255] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[100:103], v[20:21], v[16:17], a[100:103] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[108:111], v[22:23], v[14:15], a[108:111] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[108:111], v[24:25], v[16:17], a[108:111] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[80:83], v[26:27], v[6:7], a[80:83] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[132:135], v[28:29], v[8:9], a[80:83] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[80:83], v[30:31], v[6:7], a[84:87] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[252:255], v[32:33], v[8:9], a[80:83] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[26:27], v[14:15], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[28:29], v[16:17], a[56:59] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v115, a59 + v_accvgpr_read_b32 v114, a58 + v_accvgpr_read_b32 v113, a57 + v_accvgpr_read_b32 v112, a56 + v_mfma_f32_16x16x16_f16 a[56:59], v[30:31], v[14:15], a[76:79] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[84:87], v[32:33], v[16:17], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[34:35], v[6:7], a[72:75] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[36:37], v[8:9], a[56:59] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v119, a59 + v_accvgpr_read_b32 v118, a58 + v_accvgpr_read_b32 v117, a57 + v_accvgpr_read_b32 v116, a56 + v_mfma_f32_16x16x16_f16 a[56:59], v[38:39], v[6:7], a[68:71] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[40:41], v[8:9], a[56:59] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v237, a59 + v_accvgpr_read_b32 v236, a58 + v_accvgpr_read_b32 
v235, a57 + v_accvgpr_read_b32 v234, a56 + v_mfma_f32_16x16x16_f16 a[56:59], v[34:35], v[14:15], a[64:67] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[36:37], v[16:17], a[56:59] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v233, a59 + v_accvgpr_read_b32 v232, a58 + v_accvgpr_read_b32 v231, a57 + v_accvgpr_read_b32 v230, a56 + v_mfma_f32_16x16x16_f16 a[56:59], v[38:39], v[14:15], a[60:63] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[40:41], v[16:17], a[56:59] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v229, a59 + v_accvgpr_read_b32 v228, a58 + v_accvgpr_read_b32 v227, a57 + v_accvgpr_read_b32 v226, a56 + ; sched_barrier mask(0x00000000) + ds_read_b128 v[6:9], v83 + ds_read_b128 v[14:17], v85 + v_accvgpr_write_b32 a56, v152 + v_accvgpr_write_b32 a57, v153 + v_accvgpr_write_b32 a58, v154 + v_accvgpr_write_b32 a59, v155 + s_waitcnt lgkmcnt(1) + s_nop 0 + v_mfma_f32_16x16x16_f16 a[56:59], v[2:3], v[6:7], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[60:63], v[4:5], v[8:9], a[56:59] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a56, v184 + v_accvgpr_write_b32 a57, v185 + v_accvgpr_write_b32 a58, v186 + v_accvgpr_write_b32 a59, v187 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[56:59], v[10:11], v[6:7], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[64:67], v[12:13], v[8:9], a[56:59] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a56, v86 + v_accvgpr_write_b32 a57, v87 + v_accvgpr_write_b32 a58, v88 + v_accvgpr_write_b32 a59, v89 + s_waitcnt lgkmcnt(0) + s_nop 0 + v_mfma_f32_16x16x16_f16 a[56:59], v[2:3], v[14:15], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[184:187], v[4:5], v[16:17], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[10:11], v[14:15], a[240:243] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[80:83], 
v[12:13], v[16:17], a[56:59] + ; sched_barrier mask(0x000007F6) + s_nop 5 + v_accvgpr_write_b32 a56, v94 + v_accvgpr_write_b32 a57, v95 + v_accvgpr_write_b32 a58, v96 + v_accvgpr_write_b32 a59, v97 + s_nop 1 + v_mfma_f32_16x16x16_f16 a[56:59], v[18:19], v[6:7], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[152:155], v[20:21], v[8:9], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[22:23], v[6:7], a[192:195] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[72:75], v[24:25], v[8:9], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[18:19], v[14:15], a[196:199] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[68:71], v[20:21], v[16:17], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[22:23], v[14:15], a[200:203] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[76:79], v[24:25], v[16:17], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[26:27], v[6:7], a[96:99] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[240:243], v[28:29], v[8:9], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[30:31], v[6:7], a[92:95] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[196:199], v[32:33], v[8:9], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[26:27], v[14:15], a[220:223] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[192:195], v[28:29], v[16:17], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[30:31], v[14:15], a[224:227] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[220:223], v[32:33], v[16:17], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[34:35], v[6:7], a[232:235] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[200:203], v[36:37], v[8:9], a[56:59] + ; sched_barrier 
mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[38:39], v[6:7], a[116:119] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[116:119], v[40:41], v[8:9], a[56:59] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[34:35], v[14:15], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[92:95], v[36:37], v[16:17], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[38:39], v[14:15], a[124:127] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[0:3], v[40:41], v[16:17], a[0:3] + ; sched_barrier mask(0x000007F6) + s_nop 6 + v_accvgpr_read_b32 v105, a3 + v_accvgpr_read_b32 v104, a2 + v_accvgpr_read_b32 v103, a1 + v_accvgpr_read_b32 v102, a0 + ; sched_barrier mask(0x00000000) + ds_read_b128 v[20:23], v80 offset:32768 + ds_read_b128 v[32:35], v81 + ds_read_b128 v[28:31], v76 offset:4096 + ds_read_b128 v[36:39], v75 + ds_read_b128 v[16:19], v79 offset:32768 + ; sched_barrier mask(0x000007F6) + ds_read_b128 v[24:27], v77 offset:4096 + ds_read_b128 v[46:49], v78 + ; sched_barrier mask(0x000007F6) + ds_read_b128 v[58:61], v74 + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x16_f16 a[0:3], v[20:21], v[32:33], a[36:39] + ; sched_barrier mask(0x000007F6) + ds_read_b128 v[8:11], v73 offset:32768 + ; sched_barrier mask(0x000007F6) + ds_read_b128 v[12:15], v71 offset:32768 + ds_read_b128 v[4:7], v72 offset:32768 + ; sched_barrier mask(0x000007F6) + ds_read_b128 v[0:3], v70 offset:32768 + ds_read_b128 v[218:221], v69 + ds_read_b128 v[222:225], v68 + v_mfma_f32_16x16x16_f16 a[0:3], v[22:23], v[34:35], a[0:3] + ; sched_barrier mask(0x000007F6) + ds_read_b128 v[252:255], v67 + s_mul_i32 s0, s15, s4 + s_ashr_i32 s1, s0, 31 + s_waitcnt lgkmcnt(12) + v_mfma_f32_16x16x16_f16 a[36:39], v[28:29], v[32:33], a[120:123] + ; sched_barrier mask(0x000007F6) + s_lshl_b64 s[0:1], s[0:1], 1 + s_add_u32 s2, s6, s0 + s_addc_u32 s3, s7, s1 + v_mfma_f32_16x16x16_f16 a[232:235], v[30:31], v[34:35], a[36:39] + ; 
sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v94, a0 + v_accvgpr_read_b32 v95, a1 + v_accvgpr_read_b32 v99, a2 + s_waitcnt lgkmcnt(11) + v_mfma_f32_16x16x16_f16 a[36:39], v[20:21], v[36:37], a[208:211] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v98, a3 + v_cvt_f16_f32_e32 v94, v94 + v_cvt_f16_f32_e32 v99, v99 + v_mfma_f32_16x16x16_f16 a[120:123], v[22:23], v[38:39], a[36:39] + ; sched_barrier mask(0x000007F6) + v_cvt_f16_f32_e32 v95, v95 + s_ashr_i32 s13, s12, 31 + s_lshl_b64 s[0:1], s[12:13], 1 + v_mfma_f32_16x16x16_f16 a[36:39], v[28:29], v[36:37], a[40:43] + ; sched_barrier mask(0x000007F6) + s_add_u32 s0, s2, s0 + s_addc_u32 s1, s3, s1 + v_accvgpr_read_b32 v91, a232 + v_mfma_f32_16x16x16_f16 a[96:99], v[30:31], v[38:39], a[36:39] + v_accvgpr_read_b32 v84, a120 + v_accvgpr_read_b32 v82, a121 + v_accvgpr_read_b32 v89, a122 + s_waitcnt lgkmcnt(10) + v_mfma_f32_16x16x16_f16 a[36:39], v[16:17], v[32:33], a[44:47] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v90, a123 + v_cvt_f16_f32_e32 v84, v84 + v_cvt_f16_f32_e32 v89, v89 + v_mfma_f32_16x16x16_f16 a[40:43], v[18:19], v[34:35], a[36:39] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v79, a96 + v_accvgpr_read_b32 v83, a97 + v_accvgpr_read_b32 v86, a98 + s_waitcnt lgkmcnt(9) + v_mfma_f32_16x16x16_f16 a[36:39], v[24:25], v[32:33], a[48:51] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v87, a99 + v_cvt_f16_f32_e32 v90, v90 + v_cvt_f16_f32_e32 v82, v82 + v_mfma_f32_16x16x16_f16 a[36:39], v[26:27], v[34:35], a[36:39] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v80, a40 + v_accvgpr_read_b32 v78, a41 + v_accvgpr_read_b32 v81, a42 + v_mfma_f32_16x16x16_f16 a[4:7], v[16:17], v[36:37], a[4:7] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v85, a43 + v_accvgpr_read_b32 v88, a233 + v_accvgpr_read_b32 v92, a234 + v_mfma_f32_16x16x16_f16 a[4:7], v[18:19], v[38:39], a[4:7] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v74, a36 + v_accvgpr_read_b32 
v70, a37 + v_accvgpr_read_b32 v76, a38 + v_mfma_f32_16x16x16_f16 a[44:47], v[24:25], v[36:37], a[88:91] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v77, a39 + v_accvgpr_read_b32 v93, a235 + s_and_b32 s1, s1, 0xffff + s_waitcnt lgkmcnt(8) + v_mfma_f32_16x16x16_f16 a[8:11], v[20:21], v[46:47], a[8:11] + v_accvgpr_read_b32 v56, a4 + v_accvgpr_read_b32 v57, a5 + v_accvgpr_read_b32 v64, a6 + v_mfma_f32_16x16x16_f16 a[48:51], v[26:27], v[38:39], a[44:47] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v65, a7 + s_mov_b32 s3, 0x27000 + s_mov_b32 s2, 0x7ffffffe + v_mfma_f32_16x16x16_f16 a[88:91], v[22:23], v[48:49], a[8:11] + ; sched_barrier mask(0x000007F6) + v_cvt_f16_f32_e32 v86, v86 + v_cvt_f16_f32_e32 v87, v87 + v_cvt_f16_f32_e32 v81, v81 + v_mfma_f32_16x16x16_f16 a[8:11], v[28:29], v[46:47], a[12:15] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v71, a48 + v_accvgpr_read_b32 v72, a49 + v_accvgpr_read_b32 v73, a50 + v_mfma_f32_16x16x16_f16 a[44:47], v[30:31], v[48:49], a[8:11] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v75, a51 + v_pack_b32_f16 v87, v86, v87 + v_cvt_f16_f32_e32 v85, v85 + s_waitcnt lgkmcnt(7) + v_mfma_f32_16x16x16_f16 a[8:11], v[20:21], v[58:59], a[16:19] + ; sched_barrier mask(0x000007F6) + v_cvt_f16_f32_e32 v64, v64 + v_cvt_f16_f32_e32 v65, v65 + v_cvt_f16_f32_e32 v76, v76 + v_mfma_f32_16x16x16_f16 a[12:15], v[22:23], v[60:61], a[8:11] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v40, a47 + v_cvt_f16_f32_e32 v77, v77 + v_cvt_f16_f32_e32 v56, v56 + v_mfma_f32_16x16x16_f16 a[8:11], v[28:29], v[58:59], a[20:23] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[224:227], v[30:31], v[60:61], a[8:11] + ; sched_barrier mask(0x000007F6) + s_nop 1 + v_accvgpr_read_b32 v44, a12 + v_accvgpr_read_b32 v45, a13 + v_mfma_f32_16x16x16_f16 a[8:11], v[16:17], v[46:47], a[24:27] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[208:211], v[18:19], v[48:49], a[8:11] + ; sched_barrier 
mask(0x000007F6) + s_nop 0 + v_accvgpr_read_b32 v50, a224 + v_accvgpr_read_b32 v51, a225 + v_accvgpr_read_b32 v52, a226 + v_mfma_f32_16x16x16_f16 a[8:11], v[24:25], v[46:47], a[28:31] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v53, a227 + v_mfma_f32_16x16x16_f16 a[124:127], v[26:27], v[48:49], a[8:11] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v41, a209 + v_accvgpr_read_b32 v42, a210 + v_accvgpr_read_b32 v43, a211 + v_mfma_f32_16x16x16_f16 a[8:11], v[16:17], v[58:59], a[32:35] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[56:59], v[18:19], v[60:61], a[8:11] + ; sched_barrier mask(0x000007F6) + s_nop 1 + v_accvgpr_read_b32 v54, a126 + v_accvgpr_read_b32 v55, a127 + v_mfma_f32_16x16x16_f16 a[8:11], v[24:25], v[58:59], a[52:55] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(6) + v_mfma_f32_16x16x16_f16 a[16:19], v[8:9], v[32:33], a[188:191] + v_mfma_f32_16x16x16_f16 a[8:11], v[26:27], v[60:61], a[8:11] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[188:191], v[10:11], v[34:35], a[16:19] + ; sched_barrier mask(0x000007F6) + s_waitcnt lgkmcnt(5) + v_mfma_f32_16x16x16_f16 a[16:19], v[12:13], v[32:33], a[164:167] + ; sched_barrier mask(0x000007F6) + s_nop 3 + v_accvgpr_read_b32 v62, a8 + v_mfma_f32_16x16x16_f16 a[24:27], v[14:15], v[34:35], a[16:19] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v63, a9 + v_accvgpr_read_b32 v68, a10 + v_accvgpr_read_b32 v69, a11 + v_mfma_f32_16x16x16_f16 a[16:19], v[8:9], v[36:37], a[204:207] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v128, a188 + v_accvgpr_read_b32 v133, a189 + v_accvgpr_read_b32 v134, a190 + v_mfma_f32_16x16x16_f16 a[20:23], v[10:11], v[38:39], a[16:19] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v149, a24 + v_accvgpr_read_b32 v150, a25 + v_accvgpr_read_b32 v151, a26 + v_mfma_f32_16x16x16_f16 a[16:19], v[12:13], v[36:37], a[172:175] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v161, a27 + v_accvgpr_read_b32 
v135, a191 + v_mfma_f32_16x16x16_f16 a[172:175], v[14:15], v[38:39], a[16:19] + v_accvgpr_read_b32 v162, a20 + v_accvgpr_read_b32 v163, a21 + v_accvgpr_read_b32 v180, a22 + s_waitcnt lgkmcnt(4) + v_mfma_f32_16x16x16_f16 a[16:19], v[4:5], v[32:33], a[168:171] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v181, a23 + v_mfma_f32_16x16x16_f16 a[164:167], v[6:7], v[34:35], a[16:19] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v182, a172 + v_accvgpr_read_b32 v191, a173 + v_accvgpr_read_b32 v192, a174 + s_waitcnt lgkmcnt(3) + v_mfma_f32_16x16x16_f16 a[16:19], v[0:1], v[32:33], a[160:163] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v193, a175 + v_accvgpr_read_b32 v32, a88 + v_accvgpr_read_b32 v33, a89 + v_mfma_f32_16x16x16_f16 a[32:35], v[2:3], v[34:35], a[16:19] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v157, a166 + v_accvgpr_read_b32 v168, a167 + v_accvgpr_read_b32 v155, a164 + v_mfma_f32_16x16x16_f16 a[16:19], v[4:5], v[36:37], a[244:247] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v156, a165 + v_accvgpr_read_b32 v34, a90 + v_accvgpr_read_b32 v35, a91 + v_mfma_f32_16x16x16_f16 a[168:171], v[6:7], v[38:39], a[16:19] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v169, a32 + v_accvgpr_read_b32 v170, a33 + v_accvgpr_read_b32 v186, a34 + v_mfma_f32_16x16x16_f16 a[0:3], v[0:1], v[36:37], a[236:239] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v187, a35 + v_accvgpr_read_b32 v36, a44 + v_cvt_f16_f32_e32 v32, v32 + v_mfma_f32_16x16x16_f16 a[52:55], v[2:3], v[38:39], a[0:3] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v38, a45 + v_accvgpr_read_b32 v39, a46 + v_accvgpr_read_b32 v188, a168 + v_mfma_f32_16x16x16_f16 a[0:3], v[8:9], v[46:47], a[228:231] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v197, a169 + v_accvgpr_read_b32 v198, a170 + v_accvgpr_read_b32 v199, a171 + v_mfma_f32_16x16x16_f16 a[0:3], v[10:11], v[48:49], a[0:3] + ; sched_barrier mask(0x000007F6) + 
v_accvgpr_read_b32 v209, a52 + v_accvgpr_read_b32 v210, a53 + v_accvgpr_read_b32 v211, a54 + v_mfma_f32_16x16x16_f16 a[16:19], v[12:13], v[46:47], a[140:143] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v212, a55 + v_accvgpr_read_b32 v37, a208 + v_mfma_f32_16x16x16_f16 a[160:163], v[14:15], v[48:49], a[16:19] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v96, a0 + v_accvgpr_read_b32 v97, a1 + v_accvgpr_read_b32 v100, a2 + v_mfma_f32_16x16x16_f16 a[16:19], v[8:9], v[58:59], a[144:147] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v101, a3 + v_mfma_f32_16x16x16_f16 a[16:19], v[10:11], v[60:61], a[16:19] + ; sched_barrier mask(0x000007F6) + s_nop 0 + v_accvgpr_read_b32 v110, a161 + v_accvgpr_read_b32 v140, a162 + v_accvgpr_read_b32 v141, a163 + v_mfma_f32_16x16x16_f16 a[4:7], v[12:13], v[58:59], a[156:159] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[28:31], v[14:15], v[60:61], a[4:7] + ; sched_barrier mask(0x000007F6) + s_nop 0 + v_accvgpr_read_b32 v142, a16 + v_accvgpr_read_b32 v143, a17 + v_accvgpr_read_b32 v144, a18 + v_mfma_f32_16x16x16_f16 a[4:7], v[4:5], v[46:47], a[176:179] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v145, a19 + v_mfma_f32_16x16x16_f16 a[48:51], v[6:7], v[48:49], a[4:7] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v152, a28 + v_accvgpr_read_b32 v153, a29 + v_accvgpr_read_b32 v154, a30 + v_mfma_f32_16x16x16_f16 a[4:7], v[0:1], v[46:47], a[180:183] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v46, a14 + v_accvgpr_read_b32 v164, a31 + v_accvgpr_read_b32 v47, a124 + v_mfma_f32_16x16x16_f16 a[96:99], v[2:3], v[48:49], a[4:7] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v49, a15 + v_accvgpr_read_b32 v123, a48 + v_accvgpr_read_b32 v124, a49 + v_mfma_f32_16x16x16_f16 a[4:7], v[4:5], v[58:59], a[148:151] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v129, a50 + v_accvgpr_read_b32 v131, a51 + v_accvgpr_read_b32 v48, a125 + v_mfma_f32_16x16x16_f16 
a[120:123], v[6:7], v[60:61], a[4:7] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v132, a96 + v_accvgpr_read_b32 v137, a97 + v_accvgpr_read_b32 v138, a98 + v_mfma_f32_16x16x16_f16 a[4:7], v[0:1], v[58:59], a[248:251] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v58, a56 + v_accvgpr_read_b32 v59, a57 + v_accvgpr_read_b32 v139, a99 + v_mfma_f32_16x16x16_f16 a[140:143], v[2:3], v[60:61], a[4:7] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v60, a58 + v_accvgpr_read_b32 v61, a59 + v_accvgpr_read_b32 v158, a120 + s_waitcnt lgkmcnt(2) + v_mfma_f32_16x16x16_f16 a[4:7], v[20:21], v[218:219], a[216:219] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v159, a121 + v_accvgpr_read_b32 v160, a122 + v_accvgpr_read_b32 v171, a123 + v_mfma_f32_16x16x16_f16 a[12:15], v[22:23], v[220:221], a[4:7] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v172, a140 + v_accvgpr_read_b32 v173, a141 + v_accvgpr_read_b32 v189, a142 + v_mfma_f32_16x16x16_f16 a[4:7], v[28:29], v[218:219], a[212:215] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v190, a143 + v_mfma_f32_16x16x16_f16 a[4:7], v[30:31], v[220:221], a[4:7] + ; sched_barrier mask(0x000007F6) + s_nop 0 + v_accvgpr_read_b32 v136, a12 + v_accvgpr_read_b32 v146, a13 + v_accvgpr_read_b32 v147, a14 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16_f16 a[8:11], v[20:21], v[222:223], a[112:115] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v148, a15 + v_mfma_f32_16x16x16_f16 a[8:11], v[22:23], v[224:225], a[8:11] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v165, a4 + v_accvgpr_read_b32 v166, a5 + v_accvgpr_read_b32 v167, a6 + v_mfma_f32_16x16x16_f16 a[24:27], v[28:29], v[222:223], a[136:139] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v177, a7 + v_mfma_f32_16x16x16_f16 a[20:23], v[30:31], v[224:225], a[24:27] + ; sched_barrier mask(0x000007F6) + s_nop 0 + v_accvgpr_read_b32 v178, a8 + v_accvgpr_read_b32 v179, a9 + v_accvgpr_read_b32 v194, a10 + 
v_mfma_f32_16x16x16_f16 a[24:27], v[16:17], v[218:219], a[128:131] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v195, a11 + v_mfma_f32_16x16x16_f16 a[24:27], v[18:19], v[220:221], a[24:27] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v196, a20 + v_accvgpr_read_b32 v203, a21 + v_accvgpr_read_b32 v204, a22 + v_mfma_f32_16x16x16_f16 a[36:39], v[24:25], v[218:219], a[104:107] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v205, a23 + v_mfma_f32_16x16x16_f16 a[32:35], v[26:27], v[220:221], a[36:39] + ; sched_barrier mask(0x000007F6) + s_nop 0 + v_accvgpr_read_b32 v174, a24 + v_accvgpr_read_b32 v175, a25 + v_accvgpr_read_b32 v176, a26 + v_mfma_f32_16x16x16_f16 a[36:39], v[16:17], v[222:223], a[100:103] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v183, a27 + v_mfma_f32_16x16x16_f16 a[36:39], v[18:19], v[224:225], a[36:39] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v184, a32 + v_accvgpr_read_b32 v185, a33 + v_accvgpr_read_b32 v200, a34 + v_mfma_f32_16x16x16_f16 a[40:43], v[24:25], v[222:223], a[108:111] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v201, a35 + v_accvgpr_write_b32 a32, v234 + v_accvgpr_write_b32 a33, v235 + v_mfma_f32_16x16x16_f16 a[0:3], v[26:27], v[224:225], a[40:43] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a34, v236 + v_accvgpr_write_b32 a35, v237 + v_accvgpr_read_b32 v202, a36 + ds_read_b128 a[40:43], v66 + s_waitcnt lgkmcnt(1) + v_mfma_f32_16x16x16_f16 a[44:47], v[20:21], v[252:253], a[60:63] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v67, a160 + v_accvgpr_read_b32 v206, a37 + v_accvgpr_read_b32 v213, a0 + v_mfma_f32_16x16x16_f16 a[44:47], v[22:23], v[254:255], a[44:47] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v214, a1 + v_accvgpr_read_b32 v215, a2 + v_accvgpr_read_b32 v216, a3 + v_mfma_f32_16x16x16_f16 a[16:19], v[28:29], v[252:253], a[64:67] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a0, v112 + v_accvgpr_write_b32 a1, v113 + 
v_accvgpr_write_b32 a2, v114 + v_mfma_f32_16x16x16_f16 a[16:19], v[30:31], v[254:255], a[16:19] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a3, v115 + v_accvgpr_read_b32 v207, a38 + v_accvgpr_read_b32 v208, a39 + s_waitcnt lgkmcnt(0) + v_mfma_f32_16x16x16_f16 a[28:31], v[20:21], a[40:41], a[184:187] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[28:31], v[22:23], a[42:43], a[28:31] + ; sched_barrier mask(0x000007F6) + s_nop 0 + v_accvgpr_read_b32 v20, a16 + v_accvgpr_read_b32 v22, a17 + v_accvgpr_read_b32 v23, a18 + v_mfma_f32_16x16x16_f16 a[48:51], v[28:29], a[40:41], a[80:83] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[48:51], v[30:31], a[42:43], a[48:51] + ; sched_barrier mask(0x000007F6) + s_nop 0 + v_accvgpr_read_b32 v28, a28 + v_accvgpr_read_b32 v29, a29 + v_accvgpr_read_b32 v30, a30 + v_mfma_f32_16x16x16_f16 a[52:55], v[16:17], v[252:253], a[152:155] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v112, a31 + v_accvgpr_write_b32 a28, v116 + v_accvgpr_write_b32 a29, v117 + v_mfma_f32_16x16x16_f16 a[52:55], v[18:19], v[254:255], a[52:55] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a30, v118 + v_accvgpr_write_b32 a31, v119 + v_accvgpr_read_b32 v113, a48 + v_mfma_f32_16x16x16_f16 a[12:15], v[24:25], v[252:253], a[72:75] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v114, a49 + v_accvgpr_read_b32 v118, a50 + v_accvgpr_read_b32 v119, a51 + v_mfma_f32_16x16x16_f16 a[12:15], v[26:27], v[254:255], a[12:15] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v21, a52 + v_mfma_f32_16x16x16_f16 a[4:7], v[16:17], a[40:41], a[68:71] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v16, a44 + v_accvgpr_read_b32 v17, a45 + v_cvt_f16_f32_e32 v16, v16 + v_mfma_f32_16x16x16_f16 a[4:7], v[18:19], a[42:43], a[4:7] + ; sched_barrier mask(0x000007F6) + s_nop 0 + v_accvgpr_read_b32 v31, a12 + v_accvgpr_read_b32 v66, a13 + v_accvgpr_read_b32 v111, a14 + v_mfma_f32_16x16x16_f16 a[8:11], 
v[24:25], a[40:41], a[76:79] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v24, a19 + v_accvgpr_read_b32 v115, a15 + v_accvgpr_write_b32 a12, v230 + v_mfma_f32_16x16x16_f16 a[8:11], v[26:27], a[42:43], a[8:11] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a13, v231 + v_accvgpr_write_b32 a14, v232 + v_accvgpr_write_b32 a15, v233 + v_mfma_f32_16x16x16_f16 a[20:23], v[8:9], v[218:219], a[132:135] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v116, a4 + v_accvgpr_read_b32 v117, a5 + v_accvgpr_read_b32 v120, a6 + v_mfma_f32_16x16x16_f16 a[20:23], v[10:11], v[220:221], a[20:23] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v122, a8 + v_accvgpr_read_b32 v125, a9 + v_accvgpr_read_b32 v126, a10 + v_mfma_f32_16x16x16_f16 a[24:27], v[12:13], v[218:219], a[252:255] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v127, a11 + v_accvgpr_write_b32 a8, v226 + v_accvgpr_write_b32 a9, v227 + v_mfma_f32_16x16x16_f16 a[24:27], v[14:15], v[220:221], a[24:27] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a10, v228 + v_accvgpr_write_b32 a11, v229 + v_accvgpr_read_b32 v121, a7 + v_mfma_f32_16x16x16_f16 a[0:3], v[8:9], v[222:223], a[0:3] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v18, a46 + v_accvgpr_read_b32 v19, a47 + v_accvgpr_read_b32 v25, a53 + v_mfma_f32_16x16x16_f16 a[0:3], v[10:11], v[224:225], a[0:3] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v227, a26 + v_accvgpr_read_b32 v228, a27 + v_accvgpr_read_b32 v26, a54 + v_mfma_f32_16x16x16_f16 a[16:19], v[12:13], v[222:223], a[84:87] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v27, a55 + v_mfma_f32_16x16x16_f16 a[16:19], v[14:15], v[224:225], a[16:19] + ; sched_barrier mask(0x000007F6) + s_nop 0 + v_accvgpr_read_b32 v229, a0 + v_accvgpr_read_b32 v233, a1 + v_accvgpr_read_b32 v234, a2 + v_mfma_f32_16x16x16_f16 a[28:31], v[4:5], v[218:219], a[28:31] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v235, a3 + v_mfma_f32_16x16x16_f16 
a[28:31], v[6:7], v[220:221], a[28:31] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v239, a16 + v_accvgpr_read_b32 v240, a17 + v_accvgpr_read_b32 v241, a18 + v_mfma_f32_16x16x16_f16 a[32:35], v[0:1], v[218:219], a[32:35] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v245, a19 + v_accvgpr_read_b32 v219, a20 + v_mfma_f32_16x16x16_f16 a[32:35], v[2:3], v[220:221], a[32:35] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v220, a21 + v_accvgpr_read_b32 v221, a22 + v_accvgpr_read_b32 v226, a29 + v_mfma_f32_16x16x16_f16 a[12:15], v[4:5], v[222:223], a[12:15] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v230, a30 + v_accvgpr_read_b32 v231, a31 + v_mfma_f32_16x16x16_f16 a[4:7], v[6:7], v[224:225], a[12:15] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v232, a32 + v_accvgpr_read_b32 v236, a33 + v_accvgpr_read_b32 v237, a34 + v_mfma_f32_16x16x16_f16 a[8:11], v[0:1], v[222:223], a[8:11] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v222, a23 + v_accvgpr_read_b32 v223, a24 + v_accvgpr_read_b32 v238, a35 + v_mfma_f32_16x16x16_f16 a[8:11], v[2:3], v[224:225], a[8:11] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v242, a4 + v_accvgpr_read_b32 v243, a5 + v_accvgpr_read_b32 v244, a6 + v_mfma_f32_16x16x16_f16 a[12:15], v[8:9], v[252:253], a[240:243] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v246, a7 + v_accvgpr_read_b32 v224, a25 + v_accvgpr_read_b32 v225, a28 + v_mfma_f32_16x16x16_f16 a[12:15], v[10:11], v[254:255], a[12:15] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v247, a8 + v_accvgpr_read_b32 v248, a9 + v_accvgpr_read_b32 v249, a10 + v_mfma_f32_16x16x16_f16 a[0:3], v[12:13], v[252:253], a[196:199] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v250, a11 + v_mfma_f32_16x16x16_f16 a[0:3], v[14:15], v[254:255], a[0:3] + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[16:19], v[8:9], a[40:41], a[192:195] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v8, 
a12 + v_accvgpr_read_b32 v9, a13 + v_cvt_f16_f32_e32 v8, v8 + v_mfma_f32_16x16x16_f16 a[16:19], v[10:11], a[42:43], a[16:19] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v10, a14 + v_accvgpr_read_b32 v11, a15 + v_accvgpr_write_b32 a12, v102 + v_mfma_f32_16x16x16_f16 a[20:23], v[12:13], a[40:41], a[220:223] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v12, a0 + v_accvgpr_read_b32 v13, a1 + v_accvgpr_write_b32 a13, v103 + v_mfma_f32_16x16x16_f16 a[20:23], v[14:15], a[42:43], a[20:23] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v14, a2 + v_accvgpr_read_b32 v15, a3 + v_accvgpr_write_b32 a14, v104 + v_mfma_f32_16x16x16_f16 a[4:7], v[4:5], v[252:253], a[200:203] + ; sched_barrier mask(0x000007F6) + v_accvgpr_write_b32 a15, v105 + v_or_b32_e32 v102, 0x80, v108 + v_mov_b32_e32 v103, v102 + v_mfma_f32_16x16x16_f16 a[4:7], v[6:7], v[254:255], a[4:7] + ; sched_barrier mask(0x000007F6) + v_cvt_f16_f32_e32 v9, v9 + v_cvt_f16_f32_e32 v10, v10 + v_cvt_f16_f32_e32 v11, v11 + v_mfma_f32_16x16x16_f16 a[8:11], v[0:1], v[252:253], a[116:119] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v217, a16 + v_accvgpr_read_b32 v218, a17 + v_cvt_f16_f32_e32 v12, v12 + v_mfma_f32_16x16x16_f16 a[8:11], v[2:3], v[254:255], a[8:11] + ; sched_barrier mask(0x000007F6) + v_cvt_f16_f32_e32 v13, v13 + v_cvt_f16_f32_e32 v14, v14 + v_cvt_f16_f32_e32 v15, v15 + v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], a[40:41], a[92:95] + ; sched_barrier mask(0x000007F6) + v_accvgpr_read_b32 v4, a18 + v_accvgpr_read_b32 v5, a19 + v_mfma_f32_16x16x16_f16 a[0:3], v[6:7], a[42:43], a[0:3] + v_cvt_f16_f32_e32 v6, v98 + ; sched_barrier mask(0x000007F6) + v_cvt_f16_f32_e32 v98, v162 + v_pack_b32_f16 v7, v99, v6 + v_mfma_f32_16x16x16_f16 a[12:15], v[0:1], a[40:41], a[12:15] + v_pack_b32_f16 v6, v94, v95 + v_mul_lo_u32 v94, s4, v130 + v_add_lshl_u32 v1, v94, v108, 1 + ; sched_barrier mask(0x000007F6) + v_mfma_f32_16x16x16_f16 a[12:15], v[2:3], a[42:43], a[12:15] + ; sched_barrier 
mask(0x00000406) + ; sched_barrier mask(0x00000406) + buffer_store_dwordx2 v[6:7], v1, s[0:3], 0 offen + v_cvt_f16_f32_e32 v1, v91 + v_cvt_f16_f32_e32 v2, v92 + v_cvt_f16_f32_e32 v3, v93 + v_cvt_f16_f32_e32 v6, v88 + v_pack_b32_f16 v93, v89, v90 + v_pack_b32_f16 v92, v84, v82 + v_cvt_f16_f32_e32 v84, v79 + v_cvt_f16_f32_e32 v90, v83 + s_lshl_b32 s4, s4, 5 + v_add_u32_e32 v89, s4, v94 + v_pack_b32_f16 v3, v2, v3 + v_pack_b32_f16 v2, v1, v6 + v_add_lshl_u32 v6, v94, v109, 1 + v_add_lshl_u32 v82, v89, v108, 1 + v_pack_b32_f16 v86, v84, v90 + v_add_lshl_u32 v90, v89, v109, 1 + buffer_store_dwordx2 v[2:3], v6, s[0:3], 0 offen + buffer_store_dwordx2 v[92:93], v82, s[0:3], 0 offen + buffer_store_dwordx2 v[86:87], v90, s[0:3], 0 offen + v_cvt_f16_f32_e32 v87, v80 + v_cvt_f16_f32_e32 v90, v78 + v_pack_b32_f16 v93, v81, v85 + v_add_lshl_u32 v85, v94, v107, 1 + v_cvt_f16_f32_e32 v95, v161 + v_pack_b32_f16 v92, v87, v90 + v_cvt_f16_f32_e32 v87, v74 + v_cvt_f16_f32_e32 v90, v70 + buffer_store_dwordx2 v[92:93], v85, s[0:3], 0 offen + v_pack_b32_f16 v93, v76, v77 + v_add_lshl_u32 v77, v94, v106, 1 + v_pack_b32_f16 v92, v87, v90 + v_cvt_f16_f32_e32 v87, v57 + v_pack_b32_f16 v57, v64, v65 + v_cvt_f16_f32_e32 v64, v71 + v_cvt_f16_f32_e32 v71, v72 + v_cvt_f16_f32_e32 v65, v73 + v_cvt_f16_f32_e32 v72, v75 + v_pack_b32_f16 v56, v56, v87 + v_add_lshl_u32 v73, v89, v107, 1 + v_pack_b32_f16 v64, v64, v71 + v_pack_b32_f16 v65, v65, v72 + v_add_lshl_u32 v71, v89, v106, 1 + buffer_store_dwordx2 v[92:93], v77, s[0:3], 0 offen + buffer_store_dwordx2 v[56:57], v73, s[0:3], 0 offen + buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen + v_cvt_f16_f32_e32 v64, v128 + v_cvt_f16_f32_e32 v71, v133 + v_cvt_f16_f32_e32 v65, v134 + v_cvt_f16_f32_e32 v72, v135 + v_cvt_f16_f32_e32 v73, v149 + v_cvt_f16_f32_e32 v75, v150 + v_cvt_f16_f32_e32 v93, v151 + v_cvt_f16_f32_e32 v99, v163 + v_cvt_f16_f32_e32 v128, v180 + v_cvt_f16_f32_e32 v130, v181 + v_cvt_f16_f32_e32 v133, v182 + v_cvt_f16_f32_e32 v134, v191 + 
v_cvt_f16_f32_e32 v135, v192 + v_cvt_f16_f32_e32 v149, v193 + v_pack_b32_f16 v65, v65, v72 + v_pack_b32_f16 v64, v64, v71 + v_add_lshl_u32 v71, v94, v102, 1 + v_or_b32_e32 v72, 0xa0, v108 + buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen + v_pack_b32_f16 v65, v93, v95 + v_pack_b32_f16 v64, v73, v75 + v_add_lshl_u32 v71, v94, v72, 1 + buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen + v_pack_b32_f16 v65, v128, v130 + v_pack_b32_f16 v64, v98, v99 + v_add_lshl_u32 v71, v89, v102, 1 + buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen + v_pack_b32_f16 v65, v135, v149 + v_pack_b32_f16 v64, v133, v134 + v_add_lshl_u32 v71, v89, v72, 1 + v_mov_b32_e32 v102, v72 + buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen + v_cvt_f16_f32_e32 v65, v157 + v_cvt_f16_f32_e32 v72, v168 + v_cvt_f16_f32_e32 v64, v155 + v_cvt_f16_f32_e32 v71, v156 + v_cvt_f16_f32_e32 v73, v169 + v_cvt_f16_f32_e32 v75, v170 + v_cvt_f16_f32_e32 v93, v186 + v_cvt_f16_f32_e32 v95, v187 + v_cvt_f16_f32_e32 v98, v188 + v_cvt_f16_f32_e32 v99, v197 + v_cvt_f16_f32_e32 v128, v198 + v_cvt_f16_f32_e32 v130, v199 + v_cvt_f16_f32_e32 v133, v209 + v_cvt_f16_f32_e32 v134, v210 + v_cvt_f16_f32_e32 v135, v211 + v_cvt_f16_f32_e32 v149, v212 + v_pack_b32_f16 v65, v65, v72 + v_or_b32_e32 v72, 0xc0, v108 + v_pack_b32_f16 v64, v64, v71 + v_add_lshl_u32 v71, v94, v72, 1 + buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen + v_pack_b32_f16 v65, v93, v95 + v_pack_b32_f16 v64, v73, v75 + v_add_lshl_u32 v71, v94, v251, 1 + buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen + v_pack_b32_f16 v65, v128, v130 + v_pack_b32_f16 v64, v98, v99 + v_add_lshl_u32 v71, v89, v72, 1 + buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen + v_pack_b32_f16 v65, v135, v149 + v_pack_b32_f16 v64, v133, v134 + v_add_lshl_u32 v71, v89, v251, 1 + buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen + v_cvt_f16_f32_e32 v64, v33 + v_cvt_f16_f32_e32 v33, v34 + v_cvt_f16_f32_e32 v34, v35 + v_cvt_f16_f32_e32 v35, v36 + v_cvt_f16_f32_e32 v36, v38 + 
v_cvt_f16_f32_e32 v38, v39 + v_cvt_f16_f32_e32 v39, v40 + v_cvt_f16_f32_e32 v40, v44 + v_cvt_f16_f32_e32 v44, v45 + v_cvt_f16_f32_e32 v45, v46 + v_cvt_f16_f32_e32 v46, v49 + v_add_u32_e32 v77, s4, v89 + v_cvt_f16_f32_e32 v49, v50 + v_cvt_f16_f32_e32 v50, v51 + v_cvt_f16_f32_e32 v51, v52 + v_cvt_f16_f32_e32 v52, v53 + v_pack_b32_f16 v33, v33, v34 + v_pack_b32_f16 v32, v32, v64 + v_add_lshl_u32 v34, v77, v108, 1 + v_add_u32_e32 v90, s4, v77 + ; sched_barrier mask(0x00000406) + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v38, v39 + v_pack_b32_f16 v32, v35, v36 + v_add_lshl_u32 v34, v77, v109, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v45, v46 + v_pack_b32_f16 v32, v40, v44 + v_add_lshl_u32 v34, v90, v108, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v51, v52 + v_pack_b32_f16 v32, v49, v50 + v_add_lshl_u32 v34, v90, v109, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_cvt_f16_f32_e32 v32, v37 + v_cvt_f16_f32_e32 v34, v41 + v_cvt_f16_f32_e32 v33, v42 + v_cvt_f16_f32_e32 v35, v43 + v_cvt_f16_f32_e32 v36, v47 + v_cvt_f16_f32_e32 v37, v48 + v_cvt_f16_f32_e32 v38, v54 + v_cvt_f16_f32_e32 v39, v55 + v_cvt_f16_f32_e32 v40, v58 + v_cvt_f16_f32_e32 v41, v59 + v_cvt_f16_f32_e32 v42, v60 + v_cvt_f16_f32_e32 v43, v61 + v_cvt_f16_f32_e32 v44, v62 + v_cvt_f16_f32_e32 v45, v63 + v_cvt_f16_f32_e32 v46, v68 + v_cvt_f16_f32_e32 v47, v69 + v_pack_b32_f16 v33, v33, v35 + v_pack_b32_f16 v32, v32, v34 + v_add_lshl_u32 v34, v77, v107, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v38, v39 + v_pack_b32_f16 v32, v36, v37 + v_add_lshl_u32 v34, v77, v106, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v42, v43 + v_pack_b32_f16 v32, v40, v41 + v_add_lshl_u32 v34, v90, v107, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v46, v47 + v_pack_b32_f16 v32, v44, v45 + v_add_lshl_u32 v34, v90, v106, 1 
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_cvt_f16_f32_e32 v32, v96 + v_cvt_f16_f32_e32 v34, v97 + v_cvt_f16_f32_e32 v33, v100 + v_cvt_f16_f32_e32 v35, v101 + v_cvt_f16_f32_e32 v36, v67 + v_cvt_f16_f32_e32 v37, v110 + v_cvt_f16_f32_e32 v38, v140 + v_cvt_f16_f32_e32 v39, v141 + v_cvt_f16_f32_e32 v40, v142 + v_cvt_f16_f32_e32 v41, v143 + v_cvt_f16_f32_e32 v42, v144 + v_cvt_f16_f32_e32 v43, v145 + v_cvt_f16_f32_e32 v44, v152 + v_cvt_f16_f32_e32 v45, v153 + v_cvt_f16_f32_e32 v46, v154 + v_cvt_f16_f32_e32 v47, v164 + v_pack_b32_f16 v33, v33, v35 + v_pack_b32_f16 v32, v32, v34 + v_add_lshl_u32 v34, v77, v103, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v38, v39 + v_pack_b32_f16 v32, v36, v37 + v_add_lshl_u32 v34, v77, v102, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v42, v43 + v_pack_b32_f16 v32, v40, v41 + v_add_lshl_u32 v34, v90, v103, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v46, v47 + v_pack_b32_f16 v32, v44, v45 + v_add_lshl_u32 v34, v90, v102, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_cvt_f16_f32_e32 v32, v123 + v_cvt_f16_f32_e32 v34, v124 + v_cvt_f16_f32_e32 v33, v129 + v_cvt_f16_f32_e32 v35, v131 + v_cvt_f16_f32_e32 v36, v132 + v_cvt_f16_f32_e32 v37, v137 + v_cvt_f16_f32_e32 v38, v138 + v_cvt_f16_f32_e32 v39, v139 + v_cvt_f16_f32_e32 v40, v158 + v_cvt_f16_f32_e32 v41, v159 + v_cvt_f16_f32_e32 v42, v160 + v_cvt_f16_f32_e32 v43, v171 + v_cvt_f16_f32_e32 v44, v172 + v_cvt_f16_f32_e32 v45, v173 + v_cvt_f16_f32_e32 v46, v189 + v_cvt_f16_f32_e32 v47, v190 + v_pack_b32_f16 v33, v33, v35 + v_pack_b32_f16 v32, v32, v34 + v_add_lshl_u32 v34, v77, v72, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v38, v39 + v_pack_b32_f16 v32, v36, v37 + v_add_lshl_u32 v34, v77, v251, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v42, v43 + v_pack_b32_f16 v32, v40, v41 + v_add_lshl_u32 v34, 
v90, v72, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v46, v47 + v_pack_b32_f16 v32, v44, v45 + v_add_lshl_u32 v34, v90, v251, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_cvt_f16_f32_e32 v32, v136 + v_cvt_f16_f32_e32 v34, v146 + v_cvt_f16_f32_e32 v33, v147 + v_cvt_f16_f32_e32 v35, v148 + v_cvt_f16_f32_e32 v36, v165 + v_cvt_f16_f32_e32 v37, v166 + v_cvt_f16_f32_e32 v38, v167 + v_cvt_f16_f32_e32 v39, v177 + v_cvt_f16_f32_e32 v40, v178 + v_cvt_f16_f32_e32 v41, v179 + v_cvt_f16_f32_e32 v42, v194 + v_cvt_f16_f32_e32 v43, v195 + v_add_u32_e32 v92, s4, v90 + v_cvt_f16_f32_e32 v44, v196 + v_cvt_f16_f32_e32 v45, v203 + v_cvt_f16_f32_e32 v46, v204 + v_cvt_f16_f32_e32 v47, v205 + v_pack_b32_f16 v33, v33, v35 + v_pack_b32_f16 v32, v32, v34 + v_add_lshl_u32 v34, v92, v108, 1 + v_add_u32_e32 v87, s4, v92 + ; sched_barrier mask(0x00000406) + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v38, v39 + v_pack_b32_f16 v32, v36, v37 + v_add_lshl_u32 v34, v92, v109, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v42, v43 + v_pack_b32_f16 v32, v40, v41 + v_add_lshl_u32 v34, v87, v108, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v46, v47 + v_pack_b32_f16 v32, v44, v45 + v_add_lshl_u32 v34, v87, v109, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_cvt_f16_f32_e32 v32, v174 + v_cvt_f16_f32_e32 v34, v175 + v_cvt_f16_f32_e32 v33, v176 + v_cvt_f16_f32_e32 v35, v183 + v_cvt_f16_f32_e32 v36, v184 + v_cvt_f16_f32_e32 v37, v185 + v_cvt_f16_f32_e32 v38, v200 + v_cvt_f16_f32_e32 v39, v201 + v_cvt_f16_f32_e32 v40, v202 + v_cvt_f16_f32_e32 v41, v206 + v_cvt_f16_f32_e32 v42, v207 + v_cvt_f16_f32_e32 v43, v208 + v_cvt_f16_f32_e32 v44, v213 + v_cvt_f16_f32_e32 v45, v214 + v_cvt_f16_f32_e32 v46, v215 + v_cvt_f16_f32_e32 v47, v216 + v_pack_b32_f16 v33, v33, v35 + v_pack_b32_f16 v32, v32, v34 + v_add_lshl_u32 v34, v92, v107, 1 + buffer_store_dwordx2 
v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v38, v39 + v_pack_b32_f16 v32, v36, v37 + v_add_lshl_u32 v34, v92, v106, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v42, v43 + v_pack_b32_f16 v32, v40, v41 + v_add_lshl_u32 v34, v87, v107, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v46, v47 + v_pack_b32_f16 v32, v44, v45 + v_add_lshl_u32 v34, v87, v106, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_cvt_f16_f32_e32 v32, v219 + v_cvt_f16_f32_e32 v34, v220 + v_cvt_f16_f32_e32 v33, v221 + v_cvt_f16_f32_e32 v35, v222 + v_cvt_f16_f32_e32 v36, v223 + v_cvt_f16_f32_e32 v37, v224 + v_cvt_f16_f32_e32 v38, v227 + v_cvt_f16_f32_e32 v39, v228 + v_cvt_f16_f32_e32 v40, v229 + v_cvt_f16_f32_e32 v41, v233 + v_cvt_f16_f32_e32 v42, v234 + v_cvt_f16_f32_e32 v43, v235 + v_cvt_f16_f32_e32 v44, v239 + v_cvt_f16_f32_e32 v45, v240 + v_cvt_f16_f32_e32 v46, v241 + v_cvt_f16_f32_e32 v47, v245 + v_pack_b32_f16 v33, v33, v35 + v_pack_b32_f16 v32, v32, v34 + v_add_lshl_u32 v34, v92, v103, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v38, v39 + v_pack_b32_f16 v32, v36, v37 + v_add_lshl_u32 v34, v92, v102, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v42, v43 + v_pack_b32_f16 v32, v40, v41 + v_add_lshl_u32 v34, v87, v103, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v46, v47 + v_pack_b32_f16 v32, v44, v45 + v_add_lshl_u32 v34, v87, v102, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_cvt_f16_f32_e32 v32, v225 + v_cvt_f16_f32_e32 v34, v226 + v_cvt_f16_f32_e32 v33, v230 + v_cvt_f16_f32_e32 v35, v231 + v_cvt_f16_f32_e32 v36, v232 + v_cvt_f16_f32_e32 v37, v236 + v_cvt_f16_f32_e32 v38, v237 + v_cvt_f16_f32_e32 v39, v238 + v_cvt_f16_f32_e32 v40, v242 + v_cvt_f16_f32_e32 v41, v243 + v_cvt_f16_f32_e32 v42, v244 + v_cvt_f16_f32_e32 v43, v246 + v_cvt_f16_f32_e32 v44, v247 + v_cvt_f16_f32_e32 v45, v248 + 
v_cvt_f16_f32_e32 v46, v249 + v_cvt_f16_f32_e32 v47, v250 + v_pack_b32_f16 v33, v33, v35 + v_pack_b32_f16 v32, v32, v34 + v_add_lshl_u32 v34, v92, v72, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v38, v39 + v_pack_b32_f16 v32, v36, v37 + v_add_lshl_u32 v34, v92, v251, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v42, v43 + v_pack_b32_f16 v32, v40, v41 + v_add_lshl_u32 v34, v87, v72, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_pack_b32_f16 v33, v46, v47 + v_pack_b32_f16 v32, v44, v45 + v_add_lshl_u32 v34, v87, v251, 1 + buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen + v_cvt_f16_f32_e32 v32, v17 + v_cvt_f16_f32_e32 v17, v18 + v_cvt_f16_f32_e32 v18, v19 + v_cvt_f16_f32_e32 v19, v20 + v_cvt_f16_f32_e32 v20, v22 + v_cvt_f16_f32_e32 v22, v23 + v_cvt_f16_f32_e32 v23, v24 + v_cvt_f16_f32_e32 v24, v28 + v_cvt_f16_f32_e32 v28, v29 + v_cvt_f16_f32_e32 v29, v30 + v_cvt_f16_f32_e32 v30, v112 + v_add_u32_e32 v57, s4, v87 + v_cvt_f16_f32_e32 v33, v113 + v_cvt_f16_f32_e32 v34, v114 + v_cvt_f16_f32_e32 v35, v118 + v_cvt_f16_f32_e32 v36, v119 + v_pack_b32_f16 v17, v17, v18 + v_pack_b32_f16 v16, v16, v32 + v_add_lshl_u32 v18, v57, v108, 1 + v_add_u32_e32 v56, s4, v57 + ; sched_barrier mask(0x00000406) + buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen + v_pack_b32_f16 v17, v22, v23 + v_pack_b32_f16 v16, v19, v20 + v_add_lshl_u32 v18, v57, v109, 1 + buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen + v_pack_b32_f16 v17, v29, v30 + v_pack_b32_f16 v16, v24, v28 + v_add_lshl_u32 v18, v56, v108, 1 + buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen + v_pack_b32_f16 v17, v35, v36 + v_pack_b32_f16 v16, v33, v34 + v_add_lshl_u32 v18, v56, v109, 1 + buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen + v_cvt_f16_f32_e32 v16, v21 + v_cvt_f16_f32_e32 v18, v25 + v_cvt_f16_f32_e32 v17, v26 + v_cvt_f16_f32_e32 v19, v27 + v_cvt_f16_f32_e32 v20, v31 + v_cvt_f16_f32_e32 v21, v66 + v_cvt_f16_f32_e32 v22, 
v111 + v_cvt_f16_f32_e32 v23, v115 + v_cvt_f16_f32_e32 v24, v116 + v_cvt_f16_f32_e32 v25, v117 + v_cvt_f16_f32_e32 v26, v120 + v_cvt_f16_f32_e32 v27, v121 + v_cvt_f16_f32_e32 v28, v122 + v_cvt_f16_f32_e32 v29, v125 + v_cvt_f16_f32_e32 v30, v126 + v_cvt_f16_f32_e32 v31, v127 + v_pack_b32_f16 v17, v17, v19 + v_pack_b32_f16 v16, v16, v18 + v_add_lshl_u32 v18, v57, v107, 1 + buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen + v_pack_b32_f16 v17, v22, v23 + v_pack_b32_f16 v16, v20, v21 + v_add_lshl_u32 v18, v57, v106, 1 + buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen + v_pack_b32_f16 v17, v26, v27 + v_pack_b32_f16 v16, v24, v25 + v_add_lshl_u32 v18, v56, v107, 1 + buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen + v_pack_b32_f16 v17, v30, v31 + v_pack_b32_f16 v16, v28, v29 + v_add_lshl_u32 v18, v56, v106, 1 + v_accvgpr_read_b32 v0, a20 + v_accvgpr_read_b32 v7, a21 + v_accvgpr_read_b32 v88, a22 + v_accvgpr_read_b32 v91, a23 + buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen + v_cvt_f16_f32_e32 v16, v217 + v_cvt_f16_f32_e32 v17, v218 + v_cvt_f16_f32_e32 v18, v4 + v_cvt_f16_f32_e32 v19, v5 + v_cvt_f16_f32_e32 v0, v0 + v_cvt_f16_f32_e32 v7, v7 + v_cvt_f16_f32_e32 v20, v88 + v_cvt_f16_f32_e32 v21, v91 + v_pack_b32_f16 v5, v10, v11 + v_pack_b32_f16 v4, v8, v9 + v_add_lshl_u32 v8, v57, v103, 1 + buffer_store_dwordx2 v[4:5], v8, s[0:3], 0 offen + v_pack_b32_f16 v5, v14, v15 + v_pack_b32_f16 v4, v12, v13 + v_add_lshl_u32 v8, v57, v102, 1 + buffer_store_dwordx2 v[4:5], v8, s[0:3], 0 offen + v_pack_b32_f16 v5, v18, v19 + v_pack_b32_f16 v4, v16, v17 + v_add_lshl_u32 v8, v56, v103, 1 + v_accvgpr_read_b32 v1, a4 + v_accvgpr_read_b32 v2, a5 + v_accvgpr_read_b32 v3, a6 + v_accvgpr_read_b32 v6, a7 + buffer_store_dwordx2 v[4:5], v8, s[0:3], 0 offen + v_pack_b32_f16 v5, v20, v21 + v_pack_b32_f16 v4, v0, v7 + v_add_lshl_u32 v0, v56, v102, 1 + v_accvgpr_read_b32 v82, a8 + v_accvgpr_read_b32 v79, a9 + v_accvgpr_read_b32 v83, a10 + v_accvgpr_read_b32 v84, a11 + 
buffer_store_dwordx2 v[4:5], v0, s[0:3], 0 offen + v_cvt_f16_f32_e32 v0, v1 + v_cvt_f16_f32_e32 v2, v2 + v_cvt_f16_f32_e32 v1, v3 + v_cvt_f16_f32_e32 v3, v6 + v_accvgpr_read_b32 v86, a0 + v_accvgpr_read_b32 v78, a1 + v_accvgpr_read_b32 v80, a2 + v_accvgpr_read_b32 v81, a3 + v_cvt_f16_f32_e32 v4, v82 + v_cvt_f16_f32_e32 v5, v79 + v_cvt_f16_f32_e32 v6, v83 + v_cvt_f16_f32_e32 v7, v84 + v_accvgpr_read_b32 v85, a12 + v_accvgpr_read_b32 v70, a13 + v_accvgpr_read_b32 v74, a14 + v_accvgpr_read_b32 v76, a15 + v_cvt_f16_f32_e32 v8, v86 + v_cvt_f16_f32_e32 v9, v78 + v_cvt_f16_f32_e32 v10, v80 + v_cvt_f16_f32_e32 v11, v81 + v_cvt_f16_f32_e32 v12, v85 + v_cvt_f16_f32_e32 v13, v70 + v_cvt_f16_f32_e32 v14, v74 + v_cvt_f16_f32_e32 v15, v76 + v_pack_b32_f16 v1, v1, v3 + v_pack_b32_f16 v0, v0, v2 + v_add_lshl_u32 v2, v57, v72, 1 + buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen + v_pack_b32_f16 v1, v6, v7 + v_pack_b32_f16 v0, v4, v5 + v_add_lshl_u32 v2, v57, v251, 1 + buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen + v_pack_b32_f16 v1, v10, v11 + v_pack_b32_f16 v0, v8, v9 + v_add_lshl_u32 v2, v56, v72, 1 + buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen + v_pack_b32_f16 v1, v14, v15 + v_pack_b32_f16 v0, v12, v13 + v_add_lshl_u32 v2, v56, v251, 1 + ; sched_barrier mask(0x000007F6) + buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen + s_endpgm + .section .rodata,"a",@progbits + .p2align 6, 0x0 + .amdhsa_kernel matmul_kernel + .amdhsa_group_segment_fixed_size 0 + .amdhsa_private_segment_fixed_size 0 + .amdhsa_kernarg_size 72 + .amdhsa_user_sgpr_count 15 + .amdhsa_user_sgpr_dispatch_ptr 0 + .amdhsa_user_sgpr_queue_ptr 0 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 0 + .amdhsa_user_sgpr_kernarg_preload_length 13 + .amdhsa_user_sgpr_kernarg_preload_offset 0 + .amdhsa_user_sgpr_private_segment_size 0 + .amdhsa_enable_private_segment 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 0 + .amdhsa_system_sgpr_workgroup_id_z 0 + 
.amdhsa_system_sgpr_workgroup_info 0 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 512 + .amdhsa_next_free_sgpr 30 + .amdhsa_accum_offset 256 + .amdhsa_reserve_vcc 1 + .amdhsa_reserve_xnack_mask 1 + .amdhsa_float_round_mode_32 0 + .amdhsa_float_round_mode_16_64 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 + .amdhsa_dx10_clamp 1 + .amdhsa_ieee_mode 1 + .amdhsa_fp16_overflow 0 + .amdhsa_tg_split 0 + .amdhsa_exception_fp_ieee_invalid_op 0 + .amdhsa_exception_fp_denorm_src 0 + .amdhsa_exception_fp_ieee_div_zero 0 + .amdhsa_exception_fp_ieee_overflow 0 + .amdhsa_exception_fp_ieee_underflow 0 + .amdhsa_exception_fp_ieee_inexact 0 + .amdhsa_exception_int_div_zero 0 + .end_amdhsa_kernel + .text +.Lfunc_end0: + .size matmul_kernel, .Lfunc_end0-matmul_kernel + .cfi_endproc + ; -- End function + .set matmul_kernel.num_vgpr, 256 + .set matmul_kernel.num_agpr, 256 + .set matmul_kernel.numbered_sgpr, 30 + .set matmul_kernel.private_seg_size, 0 + .set matmul_kernel.uses_vcc, 1 + .set matmul_kernel.uses_flat_scratch, 0 + .set matmul_kernel.has_dyn_sized_stack, 0 + .set matmul_kernel.has_recursion, 0 + .set matmul_kernel.has_indirect_call, 0 + .section .AMDGPU.csdata,"",@progbits +; Kernel info: +; codeLenInByte = 22432 +; TotalNumSgprs: 36 +; NumVgprs: 256 +; NumAgprs: 256 +; TotalNumVgprs: 512 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 4 +; VGPRBlocks: 63 +; NumSGPRsForWavesPerEU: 36 +; NumVGPRsForWavesPerEU: 512 +; AccumOffset: 256 +; Occupancy: 1 +; WaveLimiterHint : 0 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 15 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +; COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 63 +; COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: 0 + .text + .p2alignl 6, 3212836864 + .fill 256, 4, 
3212836864 + .section .AMDGPU.gpr_maximums,"",@progbits + .set amdgpu.max_num_vgpr, 0 + .set amdgpu.max_num_agpr, 0 + .set amdgpu.max_num_sgpr, 0 + .text + .section .debug_abbrev,"",@progbits + .byte 1 ; Abbreviation Code + .byte 17 ; DW_TAG_compile_unit + .byte 0 ; DW_CHILDREN_no + .byte 37 ; DW_AT_producer + .byte 14 ; DW_FORM_strp + .byte 19 ; DW_AT_language + .byte 5 ; DW_FORM_data2 + .byte 3 ; DW_AT_name + .byte 14 ; DW_FORM_strp + .byte 16 ; DW_AT_stmt_list + .byte 23 ; DW_FORM_sec_offset + .byte 17 ; DW_AT_low_pc + .byte 1 ; DW_FORM_addr + .byte 18 ; DW_AT_high_pc + .byte 6 ; DW_FORM_data4 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 0 ; EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 ; Length of Unit +.Ldebug_info_start0: + .short 4 ; DWARF version number + .long .debug_abbrev ; Offset Into Abbrev. Section + .byte 8 ; Address Size (in bytes) + .byte 1 ; Abbrev [1] 0xb:0x1b DW_TAG_compile_unit + .long .Linfo_string0 ; DW_AT_producer + .short 2 ; DW_AT_language + .long .Linfo_string1 ; DW_AT_name + .long .Lline_table_start0 ; DW_AT_stmt_list + .quad .Lfunc_begin0 ; DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 ; DW_AT_high_pc +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "triton" ; string offset=0 +.Linfo_string1: + .asciz "" ; string offset=7 + .section ".note.GNU-stack","",@progbits + .amdgpu_metadata +--- +amdhsa.kernels: + - .agpr_count: 256 + .args: + - .address_space: global + .offset: 0 + .size: 8 + .value_kind: global_buffer + - .address_space: global + .offset: 8 + .size: 8 + .value_kind: global_buffer + - .address_space: global + .offset: 16 + .size: 8 + .value_kind: global_buffer + - .address_space: global + .offset: 24 + .size: 8 + .value_kind: global_buffer + - .offset: 32 + .size: 4 + .value_kind: by_value + - .offset: 36 + .size: 4 + .value_kind: by_value + - .offset: 40 + .size: 4 + .value_kind: by_value + - .offset: 44 + .size: 4 + .value_kind: 
by_value + - .offset: 48 + .size: 4 + .value_kind: by_value + - .offset: 52 + .size: 4 + .value_kind: by_value + - .offset: 56 + .size: 4 + .value_kind: by_value + - .address_space: global + .offset: 64 + .size: 8 + .value_kind: global_buffer + .group_segment_fixed_size: 0 + .kernarg_segment_align: 8 + .kernarg_segment_size: 72 + .max_flat_workgroup_size: 256 + .name: matmul_kernel + .private_segment_fixed_size: 0 + .sgpr_count: 36 + .sgpr_spill_count: 0 + .symbol: matmul_kernel.kd + .vgpr_count: 512 + .vgpr_spill_count: 0 + .wavefront_size: 64 +amdhsa.target: amdgcn-amd-amdhsa--gfx942 +amdhsa.version: + - 1 + - 1 +... + + .end_amdgpu_metadata + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/llvm/test/CodeGen/AMDGPU/4_tlp_fast.llir b/llvm/test/CodeGen/AMDGPU/4_tlp_fast.llir new file mode 100644 index 0000000000000..fb167d35c1c61 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/4_tlp_fast.llir @@ -0,0 +1,5722 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nofree norecurse nounwind +define amdgpu_kernel void @matmul_kernel(ptr addrspace(1) inreg nocapture readonly %0, ptr addrspace(1) inreg nocapture readonly %1, ptr addrspace(1) inreg nocapture writeonly %2, ptr addrspace(1) inreg nocapture readnone %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, ptr addrspace(1) inreg nocapture readnone %11) local_unnamed_addr #0 !dbg !4 { + %13 = tail call i32 @llvm.amdgcn.workgroup.id.x() + %14 = sdiv i32 %13, 8 + %15 = mul i32 %13, 76 + %16 = mul i32 %14, -607 + %17 = add i32 %16, %15 + %18 
= add i32 %5, 255 + %19 = sdiv i32 %18, 256 + %20 = shl nsw i32 %19, 2 + %.frozen = freeze i32 %20 + %21 = sdiv i32 %17, %.frozen + %22 = shl nsw i32 %21, 2 + %23 = mul i32 %21, %.frozen + %.decomposed = sub i32 %17, %23 + %24 = add i32 %4, 255 + %25 = sdiv i32 %24, 256 + %26 = sub nsw i32 %25, %22 + %27 = tail call i32 @llvm.smin.i32(i32 %26, i32 4) + %.decomposed.frozen = freeze i32 %.decomposed + %.frozen2426 = freeze i32 %27 + %28 = sdiv i32 %.decomposed.frozen, %.frozen2426 + %29 = mul i32 %28, %.frozen2426 + %.decomposed2427 = sub i32 %.decomposed.frozen, %29 + %30 = add nsw i32 %.decomposed2427, %22 + %31 = shl i32 %30, 8 + %32 = tail call i32 @llvm.amdgcn.workitem.id.x() + %33 = lshr i32 %32, 3 + %34 = and i32 %33, 16 + %35 = and i32 %33, 31 + %36 = or disjoint i32 %35, 32 + %37 = or disjoint i32 %35, 64 + %38 = or disjoint i32 %35, 96 + %39 = or disjoint i32 %35, 128 + %40 = or disjoint i32 %35, 160 + %41 = or disjoint i32 %35, 192 + %42 = or disjoint i32 %35, 224 + %43 = mul i32 %31, %7 + %44 = mul i32 %7, %35 + %45 = mul i32 %7, %36 + %46 = mul i32 %7, %37 + %47 = mul i32 %7, %38 + %48 = mul i32 %7, %39 + %49 = mul i32 %7, %40 + %50 = mul i32 %7, %41 + %51 = mul i32 %7, %42 + %52 = sext i32 %43 to i64 + %53 = getelementptr half, ptr addrspace(1) %0, i64 %52 + %54 = shl i32 %32, 3 + %55 = and i32 %54, 56 + %56 = add i32 %44, %55 + %57 = add i32 %45, %55 + %58 = add i32 %46, %55 + %59 = add i32 %47, %55 + %60 = add i32 %48, %55 + %61 = add i32 %49, %55 + %62 = add i32 %50, %55 + %63 = add i32 %51, %55 + %64 = getelementptr i8, ptr addrspace(1) %53, i64 128 + %65 = add i32 %6, 63 + %66 = icmp sgt i32 %65, 63 + %67 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %53, i16 0, i32 2147483646, i32 159744) + %68 = shl i32 %56, 1 + %69 = select i1 %66, i32 %68, i32 -2147483648 + %70 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %69, i32 0, i32 0) + %71 = shl i32 %57, 1 + %72 = select i1 %66, 
i32 %71, i32 -2147483648 + %73 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %72, i32 0, i32 0) + %74 = shl i32 %58, 1 + %75 = select i1 %66, i32 %74, i32 -2147483648 + %76 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %75, i32 0, i32 0) + %77 = shl i32 %59, 1 + %78 = select i1 %66, i32 %77, i32 -2147483648 + %79 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %78, i32 0, i32 0) + %80 = shl i32 %60, 1 + %81 = select i1 %66, i32 %80, i32 -2147483648 + %82 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %81, i32 0, i32 0) + %83 = shl i32 %61, 1 + %84 = select i1 %66, i32 %83, i32 -2147483648 + %85 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %84, i32 0, i32 0) + %86 = shl i32 %62, 1 + %87 = select i1 %66, i32 %86, i32 -2147483648 + %88 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %87, i32 0, i32 0) + %89 = shl i32 %63, 1 + %90 = select i1 %66, i32 %89, i32 -2147483648 + %91 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %90, i32 0, i32 0) + %92 = icmp sgt i32 %65, 127 + %93 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %64, i16 0, i32 2147483646, i32 159744) + %94 = select i1 %92, i32 %68, i32 -2147483648 + %95 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %94, i32 0, i32 0) + %96 = bitcast <4 x i32> %95 to <8 x half> + %97 = select i1 %92, i32 %71, i32 -2147483648 + %98 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %97, i32 0, i32 0) + %99 = bitcast <4 x i32> %98 to <8 x half> + %100 = select i1 %92, i32 %74, i32 -2147483648 + %101 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %100, i32 0, i32 0) + %102 = bitcast <4 x i32> %101 to <8 
x half> + %103 = select i1 %92, i32 %77, i32 -2147483648 + %104 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %103, i32 0, i32 0) + %105 = bitcast <4 x i32> %104 to <8 x half> + %106 = select i1 %92, i32 %80, i32 -2147483648 + %107 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %106, i32 0, i32 0) + %108 = bitcast <4 x i32> %107 to <8 x half> + %109 = select i1 %92, i32 %83, i32 -2147483648 + %110 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %109, i32 0, i32 0) + %111 = bitcast <4 x i32> %110 to <8 x half> + %112 = select i1 %92, i32 %86, i32 -2147483648 + %113 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %112, i32 0, i32 0) + %114 = bitcast <4 x i32> %113 to <8 x half> + %115 = select i1 %92, i32 %89, i32 -2147483648 + %116 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %115, i32 0, i32 0) + %117 = bitcast <4 x i32> %116 to <8 x half> + %118 = shl i32 %28, 8 + %119 = mul i32 %118, %8 + %120 = mul i32 %8, %35 + %121 = mul i32 %8, %36 + %122 = mul i32 %8, %37 + %123 = mul i32 %8, %38 + %124 = mul i32 %8, %39 + %125 = mul i32 %8, %40 + %126 = mul i32 %8, %41 + %127 = mul i32 %8, %42 + %128 = sext i32 %119 to i64 + %129 = getelementptr half, ptr addrspace(1) %1, i64 %128 + %130 = add i32 %120, %55 + %131 = add i32 %121, %55 + %132 = add i32 %122, %55 + %133 = add i32 %123, %55 + %134 = add i32 %124, %55 + %135 = add i32 %125, %55 + %136 = add i32 %126, %55 + %137 = add i32 %127, %55 + %138 = getelementptr i8, ptr addrspace(1) %129, i64 128 + %139 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %129, i16 0, i32 2147483646, i32 159744) + %140 = shl i32 %130, 1 + %141 = select i1 %66, i32 %140, i32 -2147483648 + %142 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %141, i32 0, i32 0) + %143 = shl i32 
%131, 1 + %144 = select i1 %66, i32 %143, i32 -2147483648 + %145 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %144, i32 0, i32 0) + %146 = shl i32 %132, 1 + %147 = select i1 %66, i32 %146, i32 -2147483648 + %148 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %147, i32 0, i32 0) + %149 = shl i32 %133, 1 + %150 = select i1 %66, i32 %149, i32 -2147483648 + %151 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %150, i32 0, i32 0) + %152 = shl i32 %134, 1 + %153 = select i1 %66, i32 %152, i32 -2147483648 + %154 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %153, i32 0, i32 0) + %155 = shl i32 %135, 1 + %156 = select i1 %66, i32 %155, i32 -2147483648 + %157 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %156, i32 0, i32 0) + %158 = shl i32 %136, 1 + %159 = select i1 %66, i32 %158, i32 -2147483648 + %160 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %159, i32 0, i32 0) + %161 = shl i32 %137, 1 + %162 = select i1 %66, i32 %161, i32 -2147483648 + %163 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %162, i32 0, i32 0) + %164 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %138, i16 0, i32 2147483646, i32 159744) + %165 = select i1 %92, i32 %140, i32 -2147483648 + %166 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %165, i32 0, i32 0) + %167 = bitcast <4 x i32> %166 to <8 x half> + %168 = select i1 %92, i32 %143, i32 -2147483648 + %169 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %168, i32 0, i32 0) + %170 = bitcast <4 x i32> %169 to <8 x half> + %171 = select i1 %92, i32 %146, i32 -2147483648 + %172 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) 
%164, i32 %171, i32 0, i32 0) + %173 = bitcast <4 x i32> %172 to <8 x half> + %174 = select i1 %92, i32 %149, i32 -2147483648 + %175 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %174, i32 0, i32 0) + %176 = bitcast <4 x i32> %175 to <8 x half> + %177 = select i1 %92, i32 %152, i32 -2147483648 + %178 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %177, i32 0, i32 0) + %179 = bitcast <4 x i32> %178 to <8 x half> + %180 = select i1 %92, i32 %155, i32 -2147483648 + %181 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %180, i32 0, i32 0) + %182 = bitcast <4 x i32> %181 to <8 x half> + %183 = select i1 %92, i32 %158, i32 -2147483648 + %184 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %183, i32 0, i32 0) + %185 = bitcast <4 x i32> %184 to <8 x half> + %186 = select i1 %92, i32 %161, i32 -2147483648 + %187 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %186, i32 0, i32 0) + %188 = bitcast <4 x i32> %187 to <8 x half> + %189 = icmp sgt i32 %7, 0 + tail call void @llvm.assume(i1 %189) + %190 = icmp sgt i32 %8, 0 + tail call void @llvm.assume(i1 %190) + %191 = icmp sgt i32 %9, 0 + tail call void @llvm.assume(i1 %191) + %192 = icmp sgt i32 %10, 0 + tail call void @llvm.assume(i1 %192) + %193 = icmp sgt i32 %30, 0 + tail call void @llvm.assume(i1 %193) + %194 = icmp sgt i32 %28, 0 + tail call void @llvm.assume(i1 %194) + %195 = xor i32 %54, %32 + %196 = and i32 %195, 56 + %197 = shl nuw nsw i32 %35, 6 + %198 = or disjoint i32 %197, %196 + %199 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %198 + store <4 x i32> %70, ptr addrspace(3) %199, align 16 + %200 = or disjoint i32 %198, 2048 + %201 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %200 + store <4 x i32> %73, ptr addrspace(3) %201, align 16 + %202 = or disjoint i32 %198, 4096 + 
%203 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %202 + store <4 x i32> %76, ptr addrspace(3) %203, align 16 + %204 = or disjoint i32 %198, 6144 + %205 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %204 + store <4 x i32> %79, ptr addrspace(3) %205, align 16 + %206 = or disjoint i32 %198, 8192 + %207 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %206 + store <4 x i32> %82, ptr addrspace(3) %207, align 16 + %208 = or disjoint i32 %198, 10240 + %209 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %208 + store <4 x i32> %85, ptr addrspace(3) %209, align 16 + %210 = or disjoint i32 %198, 12288 + %211 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %210 + store <4 x i32> %88, ptr addrspace(3) %211, align 16 + %212 = or disjoint i32 %198, 14336 + %213 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %212 + store <4 x i32> %91, ptr addrspace(3) %213, align 16 + %214 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %198 + store <4 x i32> %142, ptr addrspace(3) %214, align 16 + %215 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %200 + store <4 x i32> %145, ptr addrspace(3) %215, align 16 + %216 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %202 + store <4 x i32> %148, ptr addrspace(3) %216, align 16 + %217 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %204 + store <4 x i32> %151, ptr addrspace(3) %217, align 16 + %218 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %206 + store <4 x i32> %154, ptr addrspace(3) %218, align 16 + %219 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 
32768), i32 %208 + store <4 x i32> %157, ptr addrspace(3) %219, align 16 + %220 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %210 + store <4 x i32> %160, ptr addrspace(3) %220, align 16 + %221 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %212 + store <4 x i32> %163, ptr addrspace(3) %221, align 16 + fence syncscope("workgroup") release + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + %222 = and i32 %32, 15 + %223 = lshr i32 %32, 4 + %224 = and i32 %223, 3 + %225 = or disjoint i32 %34, %222 + %226 = and i32 %32, 7 + %227 = xor i32 %224, %226 + %228 = shl nuw nsw i32 %227, 3 + %229 = shl nuw nsw i32 %225, 6 + %230 = or disjoint i32 %229, %228 + %231 = or disjoint i32 %229, 2048 + %232 = or disjoint i32 %231, %228 + %233 = getelementptr half, ptr addrspace(3) @global_smem, i32 %230 + %234 = load <8 x half>, ptr addrspace(3) %233, align 16 + %235 = getelementptr half, ptr addrspace(3) @global_smem, i32 %232 + %236 = load <8 x half>, ptr addrspace(3) %235, align 16 + %237 = lshr i32 %32, 2 + %238 = and i32 %237, 16 + %239 = or disjoint i32 %238, %222 + %240 = shl nuw nsw i32 %239, 6 + %241 = or disjoint i32 %228, %240 + %242 = or disjoint i32 %241, 2048 + %243 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %241 + %244 = load <8 x half>, ptr addrspace(3) %243, align 16 + %245 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %242 + %246 = load <8 x half>, ptr addrspace(3) %245, align 16 + %247 = or disjoint i32 %229, 4096 + %248 = or disjoint i32 %247, %228 + %249 = or disjoint i32 %229, 6144 + %250 = or disjoint i32 %249, %228 + %251 = getelementptr half, ptr addrspace(3) @global_smem, i32 %248 + %252 = load <8 x half>, ptr addrspace(3) %251, align 16 + %253 = getelementptr half, ptr 
addrspace(3) @global_smem, i32 %250 + %254 = load <8 x half>, ptr addrspace(3) %253, align 16 + %255 = or disjoint i32 %240, 4096 + %256 = or disjoint i32 %255, %228 + %257 = or disjoint i32 %256, 2048 + %258 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %256 + %259 = load <8 x half>, ptr addrspace(3) %258, align 16 + %260 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %257 + %261 = load <8 x half>, ptr addrspace(3) %260, align 16 + %262 = icmp sgt i32 %65, 191 + br i1 %262, label %.lr.ph, label %.._crit_edge_crit_edge + +.._crit_edge_crit_edge: ; preds = %12 + %.pre = or disjoint i32 %240, 8192 + %.pre1013 = or disjoint i32 %.pre, %228 + %.pre1015 = or disjoint i32 %240, 12288 + %.pre1017 = or disjoint i32 %.pre1015, %228 + %.pre1019 = or disjoint i32 %229, 8192 + %.pre1021 = or disjoint i32 %.pre1019, %228 + %.pre1023 = or disjoint i32 %229, 10240 + %.pre1025 = or disjoint i32 %.pre1023, %228 + %.pre1027 = or disjoint i32 %229, 12288 + %.pre1029 = or disjoint i32 %.pre1027, %228 + %.pre1031 = or disjoint i32 %229, 14336 + %.pre1033 = or disjoint i32 %.pre1031, %228 + %.pre1035 = or disjoint i32 %224, 4 + %.pre1037 = xor i32 %.pre1035, %226 + %.pre1039 = shl nuw nsw i32 %.pre1037, 3 + %.pre1041 = or disjoint i32 %.pre1039, %240 + %.pre1043 = or disjoint i32 %.pre1039, %229 + %.pre1045 = or disjoint i32 %231, %.pre1039 + %.pre1047 = or disjoint i32 %.pre1039, %255 + %.pre1049 = or disjoint i32 %247, %.pre1039 + %.pre1051 = or disjoint i32 %249, %.pre1039 + %.pre1053 = or disjoint i32 %.pre1039, %.pre + %.pre1055 = or disjoint i32 %.pre1053, 2048 + %.pre1057 = or disjoint i32 %.pre1039, %.pre1015 + %.pre1059 = or disjoint i32 %.pre1057, 2048 + %.pre1061 = or disjoint i32 %.pre1019, %.pre1039 + %.pre1063 = or disjoint i32 %.pre1023, %.pre1039 + %.pre1065 = or disjoint i32 %.pre1027, %.pre1039 + %.pre1067 = or disjoint i32 %.pre1031, %.pre1039 + %263 = 
shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %264 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %265 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %266 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %267 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %268 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %269 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %270 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %271 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %272 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %273 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %274 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %275 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %276 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %277 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %278 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %279 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %280 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %281 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %282 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %283 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %284 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %285 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %286 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %287 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %288 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %289 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %290 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %291 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %292 = 
shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %293 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + %294 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + %295 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %296 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %297 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %298 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %299 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %300 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %301 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %302 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %303 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %304 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %305 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %306 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %307 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %308 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %309 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %310 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %311 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %312 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %313 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %314 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %315 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %316 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %317 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %318 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %319 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %320 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %321 
= shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %322 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %323 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %324 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %325 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %326 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %327 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %328 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %329 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %330 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %331 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %332 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %333 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %334 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %335 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %336 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %337 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %338 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %339 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %340 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %341 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %342 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %343 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %344 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %345 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %346 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %347 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %348 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %349 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %350 = 
shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %351 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %352 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %353 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %354 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %355 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %356 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %357 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + %358 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + br label %._crit_edge + +.lr.ph: ; preds = %12 + %359 = lshr i32 %65, 6 + %invariant.op404 = or disjoint i32 %240, 6144 + %invariant.op402 = or disjoint i32 %240, 2048 + %invariant.op400 = or disjoint i32 %228, 2048 + %360 = or disjoint i32 %240, 8192 + %361 = or disjoint i32 %360, %228 + %.reass = or disjoint i32 %360, %invariant.op400 + %362 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %361 + %363 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass + %364 = or disjoint i32 %240, 12288 + %365 = or disjoint i32 %364, %228 + %.reass401 = or disjoint i32 %364, %invariant.op400 + %366 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %365 + %367 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass401 + %368 = or disjoint i32 %229, 8192 + %369 = or disjoint i32 %368, %228 + %370 = or disjoint i32 %229, 10240 + %371 = or disjoint i32 %370, %228 + %372 = getelementptr half, ptr addrspace(3) @global_smem, i32 %369 + %373 = getelementptr half, ptr addrspace(3) @global_smem, i32 %371 + %374 = or disjoint i32 %229, 12288 + %375 = or disjoint i32 %374, %228 + %376 = or disjoint i32 %229, 14336 + %377 = or disjoint i32 %376, %228 + %378 = 
getelementptr half, ptr addrspace(3) @global_smem, i32 %375 + %379 = getelementptr half, ptr addrspace(3) @global_smem, i32 %377 + %380 = or disjoint i32 %224, 4 + %381 = xor i32 %380, %226 + %382 = shl nuw nsw i32 %381, 3 + %383 = or disjoint i32 %382, %229 + %384 = or disjoint i32 %231, %382 + %385 = getelementptr half, ptr addrspace(3) @global_smem, i32 %383 + %386 = getelementptr half, ptr addrspace(3) @global_smem, i32 %384 + %387 = or disjoint i32 %382, %240 + %.reass403 = or disjoint i32 %382, %invariant.op402 + %388 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %387 + %389 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass403 + %390 = or disjoint i32 %382, %255 + %.reass405 = or disjoint i32 %382, %invariant.op404 + %391 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %390 + %392 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass405 + %393 = or disjoint i32 %247, %382 + %394 = or disjoint i32 %249, %382 + %395 = getelementptr half, ptr addrspace(3) @global_smem, i32 %393 + %396 = getelementptr half, ptr addrspace(3) @global_smem, i32 %394 + %397 = or disjoint i32 %382, %360 + %398 = or disjoint i32 %397, 2048 + %399 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %397 + %400 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %398 + %401 = or disjoint i32 %382, %364 + %402 = or disjoint i32 %401, 2048 + %403 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %401 + %404 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %402 + %405 = or disjoint i32 %368, %382 + %406 = or disjoint i32 %370, %382 + %407 = 
getelementptr half, ptr addrspace(3) @global_smem, i32 %405 + %408 = getelementptr half, ptr addrspace(3) @global_smem, i32 %406 + %409 = or disjoint i32 %374, %382 + %410 = or disjoint i32 %376, %382 + %411 = getelementptr half, ptr addrspace(3) @global_smem, i32 %409 + %412 = getelementptr half, ptr addrspace(3) @global_smem, i32 %410 + %413 = add nsw i32 %359, -3 + %414 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %415 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %416 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %417 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %418 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %419 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %420 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %421 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %422 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %423 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %424 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %425 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %426 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %427 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %428 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %429 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %430 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %431 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %432 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %433 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %434 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %435 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %436 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %437 = shufflevector <8 x half> %244, <8 
x half> poison, <2 x i32> + %438 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %439 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %440 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %441 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %442 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %443 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %444 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %445 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %446 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + %447 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + %448 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + %449 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + %450 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %451 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %452 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %453 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %454 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %455 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %456 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %457 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %458 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %459 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %460 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %461 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %462 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %463 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %464 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %465 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %466 = shufflevector <8 x half> %173, 
<8 x half> poison, <2 x i32> + %467 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %468 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %469 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %470 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %471 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %472 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %473 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %474 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %475 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %476 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %477 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %478 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %479 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %480 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %481 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %482 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %483 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %484 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %485 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %486 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %487 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %488 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %489 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %490 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %491 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %492 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %493 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %494 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %495 = shufflevector <8 x half> 
%105, <8 x half> poison, <2 x i32> + %496 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %497 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %498 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %499 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %500 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %501 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %502 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %503 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %504 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %505 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %506 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %507 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %508 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %509 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + br label %510 + +510: ; preds = %.lr.ph, %510 + %511 = phi float [ 0.000000e+00, %.lr.ph ], [ %1824, %510 ] + %512 = phi float [ 0.000000e+00, %.lr.ph ], [ %1825, %510 ] + %513 = phi float [ 0.000000e+00, %.lr.ph ], [ %1826, %510 ] + %514 = phi float [ 0.000000e+00, %.lr.ph ], [ %1827, %510 ] + %515 = phi float [ 0.000000e+00, %.lr.ph ], [ %1830, %510 ] + %516 = phi float [ 0.000000e+00, %.lr.ph ], [ %1831, %510 ] + %517 = phi float [ 0.000000e+00, %.lr.ph ], [ %1832, %510 ] + %518 = phi float [ 0.000000e+00, %.lr.ph ], [ %1833, %510 ] + %519 = phi float [ 0.000000e+00, %.lr.ph ], [ %1836, %510 ] + %520 = phi float [ 0.000000e+00, %.lr.ph ], [ %1837, %510 ] + %521 = phi float [ 0.000000e+00, %.lr.ph ], [ %1838, %510 ] + %522 = phi float [ 0.000000e+00, %.lr.ph ], [ %1839, %510 ] + %523 = phi float [ 0.000000e+00, %.lr.ph ], [ %1842, %510 ] + %524 = phi float [ 0.000000e+00, %.lr.ph ], [ %1843, %510 ] + %525 = phi float [ 0.000000e+00, %.lr.ph ], [ %1844, %510 ] + %526 = phi float [ 
0.000000e+00, %.lr.ph ], [ %1845, %510 ] + %527 = phi float [ 0.000000e+00, %.lr.ph ], [ %1800, %510 ] + %528 = phi float [ 0.000000e+00, %.lr.ph ], [ %1801, %510 ] + %529 = phi float [ 0.000000e+00, %.lr.ph ], [ %1802, %510 ] + %530 = phi float [ 0.000000e+00, %.lr.ph ], [ %1803, %510 ] + %531 = phi float [ 0.000000e+00, %.lr.ph ], [ %1806, %510 ] + %532 = phi float [ 0.000000e+00, %.lr.ph ], [ %1807, %510 ] + %533 = phi float [ 0.000000e+00, %.lr.ph ], [ %1808, %510 ] + %534 = phi float [ 0.000000e+00, %.lr.ph ], [ %1809, %510 ] + %535 = phi float [ 0.000000e+00, %.lr.ph ], [ %1812, %510 ] + %536 = phi float [ 0.000000e+00, %.lr.ph ], [ %1813, %510 ] + %537 = phi float [ 0.000000e+00, %.lr.ph ], [ %1814, %510 ] + %538 = phi float [ 0.000000e+00, %.lr.ph ], [ %1815, %510 ] + %539 = phi float [ 0.000000e+00, %.lr.ph ], [ %1818, %510 ] + %540 = phi float [ 0.000000e+00, %.lr.ph ], [ %1819, %510 ] + %541 = phi float [ 0.000000e+00, %.lr.ph ], [ %1820, %510 ] + %542 = phi float [ 0.000000e+00, %.lr.ph ], [ %1821, %510 ] + %543 = phi float [ 0.000000e+00, %.lr.ph ], [ %1720, %510 ] + %544 = phi float [ 0.000000e+00, %.lr.ph ], [ %1721, %510 ] + %545 = phi float [ 0.000000e+00, %.lr.ph ], [ %1722, %510 ] + %546 = phi float [ 0.000000e+00, %.lr.ph ], [ %1723, %510 ] + %547 = phi float [ 0.000000e+00, %.lr.ph ], [ %1726, %510 ] + %548 = phi float [ 0.000000e+00, %.lr.ph ], [ %1727, %510 ] + %549 = phi float [ 0.000000e+00, %.lr.ph ], [ %1728, %510 ] + %550 = phi float [ 0.000000e+00, %.lr.ph ], [ %1729, %510 ] + %551 = phi float [ 0.000000e+00, %.lr.ph ], [ %1732, %510 ] + %552 = phi float [ 0.000000e+00, %.lr.ph ], [ %1733, %510 ] + %553 = phi float [ 0.000000e+00, %.lr.ph ], [ %1734, %510 ] + %554 = phi float [ 0.000000e+00, %.lr.ph ], [ %1735, %510 ] + %555 = phi float [ 0.000000e+00, %.lr.ph ], [ %1738, %510 ] + %556 = phi float [ 0.000000e+00, %.lr.ph ], [ %1739, %510 ] + %557 = phi float [ 0.000000e+00, %.lr.ph ], [ %1740, %510 ] + %558 = phi float [ 0.000000e+00, 
%.lr.ph ], [ %1741, %510 ] + %559 = phi float [ 0.000000e+00, %.lr.ph ], [ %1696, %510 ] + %560 = phi float [ 0.000000e+00, %.lr.ph ], [ %1697, %510 ] + %561 = phi float [ 0.000000e+00, %.lr.ph ], [ %1698, %510 ] + %562 = phi float [ 0.000000e+00, %.lr.ph ], [ %1699, %510 ] + %563 = phi float [ 0.000000e+00, %.lr.ph ], [ %1702, %510 ] + %564 = phi float [ 0.000000e+00, %.lr.ph ], [ %1703, %510 ] + %565 = phi float [ 0.000000e+00, %.lr.ph ], [ %1704, %510 ] + %566 = phi float [ 0.000000e+00, %.lr.ph ], [ %1705, %510 ] + %567 = phi float [ 0.000000e+00, %.lr.ph ], [ %1708, %510 ] + %568 = phi float [ 0.000000e+00, %.lr.ph ], [ %1709, %510 ] + %569 = phi float [ 0.000000e+00, %.lr.ph ], [ %1710, %510 ] + %570 = phi float [ 0.000000e+00, %.lr.ph ], [ %1711, %510 ] + %571 = phi float [ 0.000000e+00, %.lr.ph ], [ %1714, %510 ] + %572 = phi float [ 0.000000e+00, %.lr.ph ], [ %1715, %510 ] + %573 = phi float [ 0.000000e+00, %.lr.ph ], [ %1716, %510 ] + %574 = phi float [ 0.000000e+00, %.lr.ph ], [ %1717, %510 ] + %575 = phi float [ 0.000000e+00, %.lr.ph ], [ %1772, %510 ] + %576 = phi float [ 0.000000e+00, %.lr.ph ], [ %1773, %510 ] + %577 = phi float [ 0.000000e+00, %.lr.ph ], [ %1774, %510 ] + %578 = phi float [ 0.000000e+00, %.lr.ph ], [ %1775, %510 ] + %579 = phi float [ 0.000000e+00, %.lr.ph ], [ %1778, %510 ] + %580 = phi float [ 0.000000e+00, %.lr.ph ], [ %1779, %510 ] + %581 = phi float [ 0.000000e+00, %.lr.ph ], [ %1780, %510 ] + %582 = phi float [ 0.000000e+00, %.lr.ph ], [ %1781, %510 ] + %583 = phi float [ 0.000000e+00, %.lr.ph ], [ %1784, %510 ] + %584 = phi float [ 0.000000e+00, %.lr.ph ], [ %1785, %510 ] + %585 = phi float [ 0.000000e+00, %.lr.ph ], [ %1786, %510 ] + %586 = phi float [ 0.000000e+00, %.lr.ph ], [ %1787, %510 ] + %587 = phi float [ 0.000000e+00, %.lr.ph ], [ %1790, %510 ] + %588 = phi float [ 0.000000e+00, %.lr.ph ], [ %1791, %510 ] + %589 = phi float [ 0.000000e+00, %.lr.ph ], [ %1792, %510 ] + %590 = phi float [ 0.000000e+00, %.lr.ph ], [ 
%1793, %510 ] + %591 = phi float [ 0.000000e+00, %.lr.ph ], [ %1744, %510 ] + %592 = phi float [ 0.000000e+00, %.lr.ph ], [ %1745, %510 ] + %593 = phi float [ 0.000000e+00, %.lr.ph ], [ %1746, %510 ] + %594 = phi float [ 0.000000e+00, %.lr.ph ], [ %1747, %510 ] + %595 = phi float [ 0.000000e+00, %.lr.ph ], [ %1750, %510 ] + %596 = phi float [ 0.000000e+00, %.lr.ph ], [ %1751, %510 ] + %597 = phi float [ 0.000000e+00, %.lr.ph ], [ %1752, %510 ] + %598 = phi float [ 0.000000e+00, %.lr.ph ], [ %1753, %510 ] + %599 = phi float [ 0.000000e+00, %.lr.ph ], [ %1756, %510 ] + %600 = phi float [ 0.000000e+00, %.lr.ph ], [ %1757, %510 ] + %601 = phi float [ 0.000000e+00, %.lr.ph ], [ %1758, %510 ] + %602 = phi float [ 0.000000e+00, %.lr.ph ], [ %1759, %510 ] + %603 = phi float [ 0.000000e+00, %.lr.ph ], [ %1762, %510 ] + %604 = phi float [ 0.000000e+00, %.lr.ph ], [ %1763, %510 ] + %605 = phi float [ 0.000000e+00, %.lr.ph ], [ %1764, %510 ] + %606 = phi float [ 0.000000e+00, %.lr.ph ], [ %1765, %510 ] + %607 = phi float [ 0.000000e+00, %.lr.ph ], [ %1668, %510 ] + %608 = phi float [ 0.000000e+00, %.lr.ph ], [ %1669, %510 ] + %609 = phi float [ 0.000000e+00, %.lr.ph ], [ %1670, %510 ] + %610 = phi float [ 0.000000e+00, %.lr.ph ], [ %1671, %510 ] + %611 = phi float [ 0.000000e+00, %.lr.ph ], [ %1674, %510 ] + %612 = phi float [ 0.000000e+00, %.lr.ph ], [ %1675, %510 ] + %613 = phi float [ 0.000000e+00, %.lr.ph ], [ %1676, %510 ] + %614 = phi float [ 0.000000e+00, %.lr.ph ], [ %1677, %510 ] + %615 = phi float [ 0.000000e+00, %.lr.ph ], [ %1680, %510 ] + %616 = phi float [ 0.000000e+00, %.lr.ph ], [ %1681, %510 ] + %617 = phi float [ 0.000000e+00, %.lr.ph ], [ %1682, %510 ] + %618 = phi float [ 0.000000e+00, %.lr.ph ], [ %1683, %510 ] + %619 = phi float [ 0.000000e+00, %.lr.ph ], [ %1686, %510 ] + %620 = phi float [ 0.000000e+00, %.lr.ph ], [ %1687, %510 ] + %621 = phi float [ 0.000000e+00, %.lr.ph ], [ %1688, %510 ] + %622 = phi float [ 0.000000e+00, %.lr.ph ], [ %1689, %510 ] + 
%623 = phi float [ 0.000000e+00, %.lr.ph ], [ %1644, %510 ] + %624 = phi float [ 0.000000e+00, %.lr.ph ], [ %1645, %510 ] + %625 = phi float [ 0.000000e+00, %.lr.ph ], [ %1646, %510 ] + %626 = phi float [ 0.000000e+00, %.lr.ph ], [ %1647, %510 ] + %627 = phi float [ 0.000000e+00, %.lr.ph ], [ %1650, %510 ] + %628 = phi float [ 0.000000e+00, %.lr.ph ], [ %1651, %510 ] + %629 = phi float [ 0.000000e+00, %.lr.ph ], [ %1652, %510 ] + %630 = phi float [ 0.000000e+00, %.lr.ph ], [ %1653, %510 ] + %631 = phi float [ 0.000000e+00, %.lr.ph ], [ %1656, %510 ] + %632 = phi float [ 0.000000e+00, %.lr.ph ], [ %1657, %510 ] + %633 = phi float [ 0.000000e+00, %.lr.ph ], [ %1658, %510 ] + %634 = phi float [ 0.000000e+00, %.lr.ph ], [ %1659, %510 ] + %635 = phi float [ 0.000000e+00, %.lr.ph ], [ %1662, %510 ] + %636 = phi float [ 0.000000e+00, %.lr.ph ], [ %1663, %510 ] + %637 = phi float [ 0.000000e+00, %.lr.ph ], [ %1664, %510 ] + %638 = phi float [ 0.000000e+00, %.lr.ph ], [ %1665, %510 ] + %639 = phi float [ 0.000000e+00, %.lr.ph ], [ %1558, %510 ] + %640 = phi float [ 0.000000e+00, %.lr.ph ], [ %1559, %510 ] + %641 = phi float [ 0.000000e+00, %.lr.ph ], [ %1560, %510 ] + %642 = phi float [ 0.000000e+00, %.lr.ph ], [ %1561, %510 ] + %643 = phi float [ 0.000000e+00, %.lr.ph ], [ %1564, %510 ] + %644 = phi float [ 0.000000e+00, %.lr.ph ], [ %1565, %510 ] + %645 = phi float [ 0.000000e+00, %.lr.ph ], [ %1566, %510 ] + %646 = phi float [ 0.000000e+00, %.lr.ph ], [ %1567, %510 ] + %647 = phi float [ 0.000000e+00, %.lr.ph ], [ %1570, %510 ] + %648 = phi float [ 0.000000e+00, %.lr.ph ], [ %1571, %510 ] + %649 = phi float [ 0.000000e+00, %.lr.ph ], [ %1572, %510 ] + %650 = phi float [ 0.000000e+00, %.lr.ph ], [ %1573, %510 ] + %651 = phi float [ 0.000000e+00, %.lr.ph ], [ %1576, %510 ] + %652 = phi float [ 0.000000e+00, %.lr.ph ], [ %1577, %510 ] + %653 = phi float [ 0.000000e+00, %.lr.ph ], [ %1578, %510 ] + %654 = phi float [ 0.000000e+00, %.lr.ph ], [ %1579, %510 ] + %655 = phi 
float [ 0.000000e+00, %.lr.ph ], [ %1534, %510 ] + %656 = phi float [ 0.000000e+00, %.lr.ph ], [ %1535, %510 ] + %657 = phi float [ 0.000000e+00, %.lr.ph ], [ %1536, %510 ] + %658 = phi float [ 0.000000e+00, %.lr.ph ], [ %1537, %510 ] + %659 = phi float [ 0.000000e+00, %.lr.ph ], [ %1540, %510 ] + %660 = phi float [ 0.000000e+00, %.lr.ph ], [ %1541, %510 ] + %661 = phi float [ 0.000000e+00, %.lr.ph ], [ %1542, %510 ] + %662 = phi float [ 0.000000e+00, %.lr.ph ], [ %1543, %510 ] + %663 = phi float [ 0.000000e+00, %.lr.ph ], [ %1546, %510 ] + %664 = phi float [ 0.000000e+00, %.lr.ph ], [ %1547, %510 ] + %665 = phi float [ 0.000000e+00, %.lr.ph ], [ %1548, %510 ] + %666 = phi float [ 0.000000e+00, %.lr.ph ], [ %1549, %510 ] + %667 = phi float [ 0.000000e+00, %.lr.ph ], [ %1552, %510 ] + %668 = phi float [ 0.000000e+00, %.lr.ph ], [ %1553, %510 ] + %669 = phi float [ 0.000000e+00, %.lr.ph ], [ %1554, %510 ] + %670 = phi float [ 0.000000e+00, %.lr.ph ], [ %1555, %510 ] + %671 = phi float [ 0.000000e+00, %.lr.ph ], [ %1396, %510 ] + %672 = phi float [ 0.000000e+00, %.lr.ph ], [ %1397, %510 ] + %673 = phi float [ 0.000000e+00, %.lr.ph ], [ %1398, %510 ] + %674 = phi float [ 0.000000e+00, %.lr.ph ], [ %1399, %510 ] + %675 = phi float [ 0.000000e+00, %.lr.ph ], [ %1402, %510 ] + %676 = phi float [ 0.000000e+00, %.lr.ph ], [ %1403, %510 ] + %677 = phi float [ 0.000000e+00, %.lr.ph ], [ %1404, %510 ] + %678 = phi float [ 0.000000e+00, %.lr.ph ], [ %1405, %510 ] + %679 = phi float [ 0.000000e+00, %.lr.ph ], [ %1408, %510 ] + %680 = phi float [ 0.000000e+00, %.lr.ph ], [ %1409, %510 ] + %681 = phi float [ 0.000000e+00, %.lr.ph ], [ %1410, %510 ] + %682 = phi float [ 0.000000e+00, %.lr.ph ], [ %1411, %510 ] + %683 = phi float [ 0.000000e+00, %.lr.ph ], [ %1414, %510 ] + %684 = phi float [ 0.000000e+00, %.lr.ph ], [ %1415, %510 ] + %685 = phi float [ 0.000000e+00, %.lr.ph ], [ %1416, %510 ] + %686 = phi float [ 0.000000e+00, %.lr.ph ], [ %1417, %510 ] + %687 = phi float [ 
0.000000e+00, %.lr.ph ], [ %1372, %510 ] + %688 = phi float [ 0.000000e+00, %.lr.ph ], [ %1373, %510 ] + %689 = phi float [ 0.000000e+00, %.lr.ph ], [ %1374, %510 ] + %690 = phi float [ 0.000000e+00, %.lr.ph ], [ %1375, %510 ] + %691 = phi float [ 0.000000e+00, %.lr.ph ], [ %1378, %510 ] + %692 = phi float [ 0.000000e+00, %.lr.ph ], [ %1379, %510 ] + %693 = phi float [ 0.000000e+00, %.lr.ph ], [ %1380, %510 ] + %694 = phi float [ 0.000000e+00, %.lr.ph ], [ %1381, %510 ] + %695 = phi float [ 0.000000e+00, %.lr.ph ], [ %1384, %510 ] + %696 = phi float [ 0.000000e+00, %.lr.ph ], [ %1385, %510 ] + %697 = phi float [ 0.000000e+00, %.lr.ph ], [ %1386, %510 ] + %698 = phi float [ 0.000000e+00, %.lr.ph ], [ %1387, %510 ] + %699 = phi float [ 0.000000e+00, %.lr.ph ], [ %1390, %510 ] + %700 = phi float [ 0.000000e+00, %.lr.ph ], [ %1391, %510 ] + %701 = phi float [ 0.000000e+00, %.lr.ph ], [ %1392, %510 ] + %702 = phi float [ 0.000000e+00, %.lr.ph ], [ %1393, %510 ] + %703 = phi float [ 0.000000e+00, %.lr.ph ], [ %1510, %510 ] + %704 = phi float [ 0.000000e+00, %.lr.ph ], [ %1511, %510 ] + %705 = phi float [ 0.000000e+00, %.lr.ph ], [ %1512, %510 ] + %706 = phi float [ 0.000000e+00, %.lr.ph ], [ %1513, %510 ] + %707 = phi float [ 0.000000e+00, %.lr.ph ], [ %1516, %510 ] + %708 = phi float [ 0.000000e+00, %.lr.ph ], [ %1517, %510 ] + %709 = phi float [ 0.000000e+00, %.lr.ph ], [ %1518, %510 ] + %710 = phi float [ 0.000000e+00, %.lr.ph ], [ %1519, %510 ] + %711 = phi float [ 0.000000e+00, %.lr.ph ], [ %1522, %510 ] + %712 = phi float [ 0.000000e+00, %.lr.ph ], [ %1523, %510 ] + %713 = phi float [ 0.000000e+00, %.lr.ph ], [ %1524, %510 ] + %714 = phi float [ 0.000000e+00, %.lr.ph ], [ %1525, %510 ] + %715 = phi float [ 0.000000e+00, %.lr.ph ], [ %1528, %510 ] + %716 = phi float [ 0.000000e+00, %.lr.ph ], [ %1529, %510 ] + %717 = phi float [ 0.000000e+00, %.lr.ph ], [ %1530, %510 ] + %718 = phi float [ 0.000000e+00, %.lr.ph ], [ %1531, %510 ] + %719 = phi float [ 0.000000e+00, 
%.lr.ph ], [ %1482, %510 ] + %720 = phi float [ 0.000000e+00, %.lr.ph ], [ %1483, %510 ] + %721 = phi float [ 0.000000e+00, %.lr.ph ], [ %1484, %510 ] + %722 = phi float [ 0.000000e+00, %.lr.ph ], [ %1485, %510 ] + %723 = phi float [ 0.000000e+00, %.lr.ph ], [ %1488, %510 ] + %724 = phi float [ 0.000000e+00, %.lr.ph ], [ %1489, %510 ] + %725 = phi float [ 0.000000e+00, %.lr.ph ], [ %1490, %510 ] + %726 = phi float [ 0.000000e+00, %.lr.ph ], [ %1491, %510 ] + %727 = phi float [ 0.000000e+00, %.lr.ph ], [ %1494, %510 ] + %728 = phi float [ 0.000000e+00, %.lr.ph ], [ %1495, %510 ] + %729 = phi float [ 0.000000e+00, %.lr.ph ], [ %1496, %510 ] + %730 = phi float [ 0.000000e+00, %.lr.ph ], [ %1497, %510 ] + %731 = phi float [ 0.000000e+00, %.lr.ph ], [ %1500, %510 ] + %732 = phi float [ 0.000000e+00, %.lr.ph ], [ %1501, %510 ] + %733 = phi float [ 0.000000e+00, %.lr.ph ], [ %1502, %510 ] + %734 = phi float [ 0.000000e+00, %.lr.ph ], [ %1503, %510 ] + %735 = phi float [ 0.000000e+00, %.lr.ph ], [ %1340, %510 ] + %736 = phi float [ 0.000000e+00, %.lr.ph ], [ %1341, %510 ] + %737 = phi float [ 0.000000e+00, %.lr.ph ], [ %1342, %510 ] + %738 = phi float [ 0.000000e+00, %.lr.ph ], [ %1343, %510 ] + %739 = phi float [ 0.000000e+00, %.lr.ph ], [ %1346, %510 ] + %740 = phi float [ 0.000000e+00, %.lr.ph ], [ %1347, %510 ] + %741 = phi float [ 0.000000e+00, %.lr.ph ], [ %1348, %510 ] + %742 = phi float [ 0.000000e+00, %.lr.ph ], [ %1349, %510 ] + %743 = phi float [ 0.000000e+00, %.lr.ph ], [ %1352, %510 ] + %744 = phi float [ 0.000000e+00, %.lr.ph ], [ %1353, %510 ] + %745 = phi float [ 0.000000e+00, %.lr.ph ], [ %1354, %510 ] + %746 = phi float [ 0.000000e+00, %.lr.ph ], [ %1355, %510 ] + %747 = phi float [ 0.000000e+00, %.lr.ph ], [ %1358, %510 ] + %748 = phi float [ 0.000000e+00, %.lr.ph ], [ %1359, %510 ] + %749 = phi float [ 0.000000e+00, %.lr.ph ], [ %1360, %510 ] + %750 = phi float [ 0.000000e+00, %.lr.ph ], [ %1361, %510 ] + %751 = phi ptr addrspace(1) [ %138, %.lr.ph ], [ 
%1620, %510 ] + %752 = phi ptr addrspace(1) [ %64, %.lr.ph ], [ %1458, %510 ] + %753 = phi float [ 0.000000e+00, %.lr.ph ], [ %1308, %510 ] + %754 = phi float [ 0.000000e+00, %.lr.ph ], [ %1309, %510 ] + %755 = phi float [ 0.000000e+00, %.lr.ph ], [ %1310, %510 ] + %756 = phi float [ 0.000000e+00, %.lr.ph ], [ %1311, %510 ] + %757 = phi float [ 0.000000e+00, %.lr.ph ], [ %1314, %510 ] + %758 = phi float [ 0.000000e+00, %.lr.ph ], [ %1315, %510 ] + %759 = phi float [ 0.000000e+00, %.lr.ph ], [ %1316, %510 ] + %760 = phi float [ 0.000000e+00, %.lr.ph ], [ %1317, %510 ] + %761 = phi float [ 0.000000e+00, %.lr.ph ], [ %1320, %510 ] + %762 = phi float [ 0.000000e+00, %.lr.ph ], [ %1321, %510 ] + %763 = phi float [ 0.000000e+00, %.lr.ph ], [ %1322, %510 ] + %764 = phi float [ 0.000000e+00, %.lr.ph ], [ %1323, %510 ] + %765 = phi float [ 0.000000e+00, %.lr.ph ], [ %1326, %510 ] + %766 = phi float [ 0.000000e+00, %.lr.ph ], [ %1327, %510 ] + %767 = phi float [ 0.000000e+00, %.lr.ph ], [ %1328, %510 ] + %768 = phi float [ 0.000000e+00, %.lr.ph ], [ %1329, %510 ] + %769 = phi i32 [ 0, %.lr.ph ], [ %1846, %510 ] + %770 = phi <2 x half> [ %414, %.lr.ph ], [ %1910, %510 ] + %771 = phi <2 x half> [ %415, %.lr.ph ], [ %1909, %510 ] + %772 = phi <2 x half> [ %416, %.lr.ph ], [ %1908, %510 ] + %773 = phi <2 x half> [ %417, %.lr.ph ], [ %1907, %510 ] + %774 = phi <2 x half> [ %418, %.lr.ph ], [ %1906, %510 ] + %775 = phi <2 x half> [ %419, %.lr.ph ], [ %1905, %510 ] + %776 = phi <2 x half> [ %420, %.lr.ph ], [ %1904, %510 ] + %777 = phi <2 x half> [ %421, %.lr.ph ], [ %1903, %510 ] + %778 = phi <2 x half> [ %422, %.lr.ph ], [ %1902, %510 ] + %779 = phi <2 x half> [ %423, %.lr.ph ], [ %1901, %510 ] + %780 = phi <2 x half> [ %424, %.lr.ph ], [ %1900, %510 ] + %781 = phi <2 x half> [ %425, %.lr.ph ], [ %1899, %510 ] + %782 = phi <2 x half> [ %426, %.lr.ph ], [ %1898, %510 ] + %783 = phi <2 x half> [ %427, %.lr.ph ], [ %1897, %510 ] + %784 = phi <2 x half> [ %428, %.lr.ph ], [ %1896, 
%510 ] + %785 = phi <2 x half> [ %429, %.lr.ph ], [ %1895, %510 ] + %786 = phi <2 x half> [ %430, %.lr.ph ], [ %1894, %510 ] + %787 = phi <2 x half> [ %431, %.lr.ph ], [ %1893, %510 ] + %788 = phi <2 x half> [ %432, %.lr.ph ], [ %1892, %510 ] + %789 = phi <2 x half> [ %433, %.lr.ph ], [ %1891, %510 ] + %790 = phi <2 x half> [ %434, %.lr.ph ], [ %1890, %510 ] + %791 = phi <2 x half> [ %435, %.lr.ph ], [ %1889, %510 ] + %792 = phi <2 x half> [ %436, %.lr.ph ], [ %1888, %510 ] + %793 = phi <2 x half> [ %437, %.lr.ph ], [ %1887, %510 ] + %794 = phi <2 x half> [ %438, %.lr.ph ], [ %1886, %510 ] + %795 = phi <2 x half> [ %439, %.lr.ph ], [ %1885, %510 ] + %796 = phi <2 x half> [ %440, %.lr.ph ], [ %1884, %510 ] + %797 = phi <2 x half> [ %441, %.lr.ph ], [ %1883, %510 ] + %798 = phi <2 x half> [ %442, %.lr.ph ], [ %1882, %510 ] + %799 = phi <2 x half> [ %443, %.lr.ph ], [ %1881, %510 ] + %800 = phi <2 x half> [ %444, %.lr.ph ], [ %1880, %510 ] + %801 = phi <2 x half> [ %445, %.lr.ph ], [ %1879, %510 ] + %802 = phi <2 x half> [ %446, %.lr.ph ], [ %1878, %510 ] + %803 = phi <2 x half> [ %447, %.lr.ph ], [ %1942, %510 ] + %804 = phi <2 x half> [ %448, %.lr.ph ], [ %1941, %510 ] + %805 = phi <2 x half> [ %449, %.lr.ph ], [ %1877, %510 ] + %806 = phi <2 x half> [ %450, %.lr.ph ], [ %1876, %510 ] + %807 = phi <2 x half> [ %451, %.lr.ph ], [ %1940, %510 ] + %808 = phi <2 x half> [ %452, %.lr.ph ], [ %1939, %510 ] + %809 = phi <2 x half> [ %453, %.lr.ph ], [ %1875, %510 ] + %810 = phi <2 x half> [ %454, %.lr.ph ], [ %1874, %510 ] + %811 = phi <2 x half> [ %455, %.lr.ph ], [ %1938, %510 ] + %812 = phi <2 x half> [ %456, %.lr.ph ], [ %1937, %510 ] + %813 = phi <2 x half> [ %457, %.lr.ph ], [ %1873, %510 ] + %814 = phi <2 x half> [ %458, %.lr.ph ], [ %1872, %510 ] + %815 = phi <2 x half> [ %459, %.lr.ph ], [ %1936, %510 ] + %816 = phi <2 x half> [ %460, %.lr.ph ], [ %1935, %510 ] + %817 = phi <2 x half> [ %461, %.lr.ph ], [ %1871, %510 ] + %818 = phi <2 x half> [ %462, %.lr.ph ], [ 
%1870, %510 ] + %819 = phi <2 x half> [ %463, %.lr.ph ], [ %1934, %510 ] + %820 = phi <2 x half> [ %464, %.lr.ph ], [ %1933, %510 ] + %821 = phi <2 x half> [ %465, %.lr.ph ], [ %1869, %510 ] + %822 = phi <2 x half> [ %466, %.lr.ph ], [ %1868, %510 ] + %823 = phi <2 x half> [ %467, %.lr.ph ], [ %1932, %510 ] + %824 = phi <2 x half> [ %468, %.lr.ph ], [ %1931, %510 ] + %825 = phi <2 x half> [ %469, %.lr.ph ], [ %1867, %510 ] + %826 = phi <2 x half> [ %470, %.lr.ph ], [ %1866, %510 ] + %827 = phi <2 x half> [ %471, %.lr.ph ], [ %1930, %510 ] + %828 = phi <2 x half> [ %472, %.lr.ph ], [ %1929, %510 ] + %829 = phi <2 x half> [ %473, %.lr.ph ], [ %1865, %510 ] + %830 = phi <2 x half> [ %474, %.lr.ph ], [ %1864, %510 ] + %831 = phi <2 x half> [ %475, %.lr.ph ], [ %1928, %510 ] + %832 = phi <2 x half> [ %476, %.lr.ph ], [ %1927, %510 ] + %833 = phi <2 x half> [ %477, %.lr.ph ], [ %1863, %510 ] + %834 = phi <2 x half> [ %478, %.lr.ph ], [ %1862, %510 ] + %835 = phi <2 x half> [ %479, %.lr.ph ], [ %1926, %510 ] + %836 = phi <2 x half> [ %480, %.lr.ph ], [ %1925, %510 ] + %837 = phi <2 x half> [ %481, %.lr.ph ], [ %1861, %510 ] + %838 = phi <2 x half> [ %482, %.lr.ph ], [ %1860, %510 ] + %839 = phi <2 x half> [ %483, %.lr.ph ], [ %1924, %510 ] + %840 = phi <2 x half> [ %484, %.lr.ph ], [ %1923, %510 ] + %841 = phi <2 x half> [ %485, %.lr.ph ], [ %1859, %510 ] + %842 = phi <2 x half> [ %486, %.lr.ph ], [ %1858, %510 ] + %843 = phi <2 x half> [ %487, %.lr.ph ], [ %1922, %510 ] + %844 = phi <2 x half> [ %488, %.lr.ph ], [ %1921, %510 ] + %845 = phi <2 x half> [ %489, %.lr.ph ], [ %1857, %510 ] + %846 = phi <2 x half> [ %490, %.lr.ph ], [ %1856, %510 ] + %847 = phi <2 x half> [ %491, %.lr.ph ], [ %1920, %510 ] + %848 = phi <2 x half> [ %492, %.lr.ph ], [ %1919, %510 ] + %849 = phi <2 x half> [ %493, %.lr.ph ], [ %1855, %510 ] + %850 = phi <2 x half> [ %494, %.lr.ph ], [ %1854, %510 ] + %851 = phi <2 x half> [ %495, %.lr.ph ], [ %1918, %510 ] + %852 = phi <2 x half> [ %496, 
%.lr.ph ], [ %1917, %510 ] + %853 = phi <2 x half> [ %497, %.lr.ph ], [ %1853, %510 ] + %854 = phi <2 x half> [ %498, %.lr.ph ], [ %1852, %510 ] + %855 = phi <2 x half> [ %499, %.lr.ph ], [ %1916, %510 ] + %856 = phi <2 x half> [ %500, %.lr.ph ], [ %1915, %510 ] + %857 = phi <2 x half> [ %501, %.lr.ph ], [ %1851, %510 ] + %858 = phi <2 x half> [ %502, %.lr.ph ], [ %1850, %510 ] + %859 = phi <2 x half> [ %503, %.lr.ph ], [ %1914, %510 ] + %860 = phi <2 x half> [ %504, %.lr.ph ], [ %1913, %510 ] + %861 = phi <2 x half> [ %505, %.lr.ph ], [ %1849, %510 ] + %862 = phi <2 x half> [ %506, %.lr.ph ], [ %1848, %510 ] + %863 = phi <2 x half> [ %507, %.lr.ph ], [ %1912, %510 ] + %864 = phi <2 x half> [ %508, %.lr.ph ], [ %1911, %510 ] + %865 = phi <2 x half> [ %509, %.lr.ph ], [ %1847, %510 ] + %866 = shufflevector <2 x half> %801, <2 x half> %800, <4 x i32> + %867 = shufflevector <2 x half> %799, <2 x half> %798, <4 x i32> + %868 = shufflevector <2 x half> %797, <2 x half> %796, <4 x i32> + %869 = shufflevector <2 x half> %795, <2 x half> %794, <4 x i32> + %870 = shufflevector <2 x half> %793, <2 x half> %792, <4 x i32> + %871 = shufflevector <2 x half> %791, <2 x half> %790, <4 x i32> + %872 = shufflevector <2 x half> %789, <2 x half> %788, <4 x i32> + %873 = shufflevector <2 x half> %787, <2 x half> %786, <4 x i32> + %874 = insertelement <4 x float> poison, float %753, i64 0 + %875 = insertelement <4 x float> %874, float %754, i64 1 + %876 = insertelement <4 x float> %875, float %755, i64 2 + %877 = insertelement <4 x float> %876, float %756, i64 3 + %878 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %866, <4 x float> %877, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %879 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %867, <4 x float> %878, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %880 = insertelement <4 x float> poison, 
float %757, i64 0 + %881 = insertelement <4 x float> %880, float %758, i64 1 + %882 = insertelement <4 x float> %881, float %759, i64 2 + %883 = insertelement <4 x float> %882, float %760, i64 3 + %884 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %866, <4 x float> %883, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %885 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %867, <4 x float> %884, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %886 = insertelement <4 x float> poison, float %761, i64 0 + %887 = insertelement <4 x float> %886, float %762, i64 1 + %888 = insertelement <4 x float> %887, float %763, i64 2 + %889 = insertelement <4 x float> %888, float %764, i64 3 + %890 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %868, <4 x float> %889, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %891 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %869, <4 x float> %890, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %892 = insertelement <4 x float> poison, float %765, i64 0 + %893 = insertelement <4 x float> %892, float %766, i64 1 + %894 = insertelement <4 x float> %893, float %767, i64 2 + %895 = insertelement <4 x float> %894, float %768, i64 3 + %896 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %868, <4 x float> %895, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %897 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %869, <4 x float> %896, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !7 + %898 = shufflevector <2 x half> %777, <2 x half> %776, <4 x i32> + %899 = shufflevector <2 x 
half> %775, <2 x half> %774, <4 x i32> + %900 = shufflevector <2 x half> %773, <2 x half> %772, <4 x i32> + %901 = shufflevector <2 x half> %771, <2 x half> %770, <4 x i32> + %902 = insertelement <4 x float> poison, float %735, i64 0 + %903 = insertelement <4 x float> %902, float %736, i64 1 + %904 = insertelement <4 x float> %903, float %737, i64 2 + %905 = insertelement <4 x float> %904, float %738, i64 3 + %906 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %866, <4 x float> %905, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %907 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %867, <4 x float> %906, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %908 = insertelement <4 x float> poison, float %739, i64 0 + %909 = insertelement <4 x float> %908, float %740, i64 1 + %910 = insertelement <4 x float> %909, float %741, i64 2 + %911 = insertelement <4 x float> %910, float %742, i64 3 + %912 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %866, <4 x float> %911, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %913 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %867, <4 x float> %912, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %914 = insertelement <4 x float> poison, float %743, i64 0 + %915 = insertelement <4 x float> %914, float %744, i64 1 + %916 = insertelement <4 x float> %915, float %745, i64 2 + %917 = insertelement <4 x float> %916, float %746, i64 3 + %918 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %868, <4 x float> %917, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %919 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %869, <4 x float> %918, i32 0, i32 0, i32 0) + tail call 
void @llvm.amdgcn.sched.barrier(i32 2038) + %920 = insertelement <4 x float> poison, float %747, i64 0 + %921 = insertelement <4 x float> %920, float %748, i64 1 + %922 = insertelement <4 x float> %921, float %749, i64 2 + %923 = insertelement <4 x float> %922, float %750, i64 3 + %924 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %868, <4 x float> %923, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %925 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %869, <4 x float> %924, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !8 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !9 + %926 = load <8 x half>, ptr addrspace(3) %362, align 16 + %927 = load <8 x half>, ptr addrspace(3) %363, align 16 + %928 = load <8 x half>, ptr addrspace(3) %366, align 16 + %929 = load <8 x half>, ptr addrspace(3) %367, align 16 + %930 = shufflevector <2 x half> %785, <2 x half> %784, <4 x i32> + %931 = shufflevector <2 x half> %783, <2 x half> %782, <4 x i32> + %932 = shufflevector <2 x half> %781, <2 x half> %780, <4 x i32> + %933 = shufflevector <2 x half> %779, <2 x half> %778, <4 x i32> + %934 = insertelement <4 x float> poison, float %687, i64 0 + %935 = insertelement <4 x float> %934, float %688, i64 1 + %936 = insertelement <4 x float> %935, float %689, i64 2 + %937 = insertelement <4 x float> %936, float %690, i64 3 + %938 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %930, <4 x float> %937, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %939 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %931, <4 x float> %938, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %940 = insertelement <4 x float> poison, float %691, i64 0 + %941 = 
insertelement <4 x float> %940, float %692, i64 1 + %942 = insertelement <4 x float> %941, float %693, i64 2 + %943 = insertelement <4 x float> %942, float %694, i64 3 + %944 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %930, <4 x float> %943, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %945 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %931, <4 x float> %944, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %946 = insertelement <4 x float> poison, float %695, i64 0 + %947 = insertelement <4 x float> %946, float %696, i64 1 + %948 = insertelement <4 x float> %947, float %697, i64 2 + %949 = insertelement <4 x float> %948, float %698, i64 3 + %950 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %932, <4 x float> %949, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %951 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %933, <4 x float> %950, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %952 = insertelement <4 x float> poison, float %699, i64 0 + %953 = insertelement <4 x float> %952, float %700, i64 1 + %954 = insertelement <4 x float> %953, float %701, i64 2 + %955 = insertelement <4 x float> %954, float %702, i64 3 + %956 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %932, <4 x float> %955, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %957 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %933, <4 x float> %956, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !10 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !11 + tail call void 
@llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !12 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !13 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !14 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !15 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !16 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !17 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !18 + %958 = insertelement <4 x float> poison, float %671, i64 0 + %959 = insertelement <4 x float> %958, float %672, i64 1 + %960 = insertelement <4 x float> %959, float %673, i64 2 + %961 = insertelement <4 x float> %960, float %674, i64 3 + %962 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %930, <4 x float> %961, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %963 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %931, <4 x float> %962, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %964 = insertelement <4 x float> poison, float %675, i64 0 + %965 = insertelement <4 x float> %964, float %676, i64 1 + %966 = insertelement <4 x float> %965, float %677, i64 2 + %967 = insertelement <4 x float> %966, float %678, i64 3 + %968 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %930, <4 x float> %967, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %969 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %931, <4 x float> %968, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %970 = insertelement <4 x float> poison, float %679, i64 0 + %971 = insertelement <4 x float> %970, float %680, i64 1 + %972 = insertelement <4 x float> %971, float %681, i64 2 + %973 = insertelement <4 x float> %972, 
float %682, i64 3 + %974 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %932, <4 x float> %973, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %975 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %933, <4 x float> %974, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %976 = insertelement <4 x float> poison, float %683, i64 0 + %977 = insertelement <4 x float> %976, float %684, i64 1 + %978 = insertelement <4 x float> %977, float %685, i64 2 + %979 = insertelement <4 x float> %978, float %686, i64 3 + %980 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %932, <4 x float> %979, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %981 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %933, <4 x float> %980, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !19 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !20 + %982 = shufflevector <8 x half> %926, <8 x half> poison, <4 x i32> + %983 = shufflevector <8 x half> %926, <8 x half> poison, <4 x i32> + %984 = shufflevector <8 x half> %927, <8 x half> poison, <4 x i32> + %985 = shufflevector <8 x half> %927, <8 x half> poison, <4 x i32> + %986 = insertelement <4 x float> poison, float %719, i64 0 + %987 = insertelement <4 x float> %986, float %720, i64 1 + %988 = insertelement <4 x float> %987, float %721, i64 2 + %989 = insertelement <4 x float> %988, float %722, i64 3 + %990 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %866, <4 x float> %989, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %991 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %867, <4 x float> %990, i32 0, i32 
0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %992 = insertelement <4 x float> poison, float %723, i64 0 + %993 = insertelement <4 x float> %992, float %724, i64 1 + %994 = insertelement <4 x float> %993, float %725, i64 2 + %995 = insertelement <4 x float> %994, float %726, i64 3 + %996 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %866, <4 x float> %995, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %997 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %867, <4 x float> %996, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %998 = insertelement <4 x float> poison, float %727, i64 0 + %999 = insertelement <4 x float> %998, float %728, i64 1 + %1000 = insertelement <4 x float> %999, float %729, i64 2 + %1001 = insertelement <4 x float> %1000, float %730, i64 3 + %1002 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %868, <4 x float> %1001, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1003 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %869, <4 x float> %1002, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1004 = insertelement <4 x float> poison, float %731, i64 0 + %1005 = insertelement <4 x float> %1004, float %732, i64 1 + %1006 = insertelement <4 x float> %1005, float %733, i64 2 + %1007 = insertelement <4 x float> %1006, float %734, i64 3 + %1008 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %868, <4 x float> %1007, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1009 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %869, <4 x float> %1008, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void 
@llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !21 + %1010 = shufflevector <8 x half> %928, <8 x half> poison, <4 x i32> + %1011 = shufflevector <8 x half> %928, <8 x half> poison, <4 x i32> + %1012 = shufflevector <8 x half> %929, <8 x half> poison, <4 x i32> + %1013 = shufflevector <8 x half> %929, <8 x half> poison, <4 x i32> + %1014 = insertelement <4 x float> poison, float %703, i64 0 + %1015 = insertelement <4 x float> %1014, float %704, i64 1 + %1016 = insertelement <4 x float> %1015, float %705, i64 2 + %1017 = insertelement <4 x float> %1016, float %706, i64 3 + %1018 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %866, <4 x float> %1017, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1019 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %867, <4 x float> %1018, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1020 = insertelement <4 x float> poison, float %707, i64 0 + %1021 = insertelement <4 x float> %1020, float %708, i64 1 + %1022 = insertelement <4 x float> %1021, float %709, i64 2 + %1023 = insertelement <4 x float> %1022, float %710, i64 3 + %1024 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %866, <4 x float> %1023, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1025 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %867, <4 x float> %1024, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1026 = insertelement <4 x float> poison, float %711, i64 0 + %1027 = insertelement <4 x float> %1026, float %712, i64 1 + %1028 = insertelement <4 x float> %1027, float %713, i64 2 + %1029 = insertelement <4 x float> %1028, float %714, i64 3 + %1030 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %868, <4 x float> %1029, i32 0, i32 0, i32 0) 
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1031 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %869, <4 x float> %1030, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1032 = insertelement <4 x float> poison, float %715, i64 0 + %1033 = insertelement <4 x float> %1032, float %716, i64 1 + %1034 = insertelement <4 x float> %1033, float %717, i64 2 + %1035 = insertelement <4 x float> %1034, float %718, i64 3 + %1036 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %868, <4 x float> %1035, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1037 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %869, <4 x float> %1036, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !22 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !23 + %1038 = load <8 x half>, ptr addrspace(3) %372, align 16 + %1039 = load <8 x half>, ptr addrspace(3) %373, align 16 + %1040 = load <8 x half>, ptr addrspace(3) %378, align 16 + %1041 = load <8 x half>, ptr addrspace(3) %379, align 16 + %1042 = insertelement <4 x float> poison, float %655, i64 0 + %1043 = insertelement <4 x float> %1042, float %656, i64 1 + %1044 = insertelement <4 x float> %1043, float %657, i64 2 + %1045 = insertelement <4 x float> %1044, float %658, i64 3 + %1046 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %930, <4 x float> %1045, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1047 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %931, <4 x float> %1046, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1048 = insertelement <4 x float> poison, float %659, i64 0 + %1049 = insertelement <4 x float> %1048, 
float %660, i64 1 + %1050 = insertelement <4 x float> %1049, float %661, i64 2 + %1051 = insertelement <4 x float> %1050, float %662, i64 3 + %1052 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %930, <4 x float> %1051, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1053 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %931, <4 x float> %1052, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1054 = insertelement <4 x float> poison, float %663, i64 0 + %1055 = insertelement <4 x float> %1054, float %664, i64 1 + %1056 = insertelement <4 x float> %1055, float %665, i64 2 + %1057 = insertelement <4 x float> %1056, float %666, i64 3 + %1058 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %932, <4 x float> %1057, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1059 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %933, <4 x float> %1058, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1060 = insertelement <4 x float> poison, float %667, i64 0 + %1061 = insertelement <4 x float> %1060, float %668, i64 1 + %1062 = insertelement <4 x float> %1061, float %669, i64 2 + %1063 = insertelement <4 x float> %1062, float %670, i64 3 + %1064 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %932, <4 x float> %1063, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1065 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %933, <4 x float> %1064, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !24 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !25 + tail call void 
@llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !26 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !27 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !28 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !29 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !30 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !31 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !32 + %1066 = insertelement <4 x float> poison, float %639, i64 0 + %1067 = insertelement <4 x float> %1066, float %640, i64 1 + %1068 = insertelement <4 x float> %1067, float %641, i64 2 + %1069 = insertelement <4 x float> %1068, float %642, i64 3 + %1070 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %930, <4 x float> %1069, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1071 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %931, <4 x float> %1070, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1072 = insertelement <4 x float> poison, float %643, i64 0 + %1073 = insertelement <4 x float> %1072, float %644, i64 1 + %1074 = insertelement <4 x float> %1073, float %645, i64 2 + %1075 = insertelement <4 x float> %1074, float %646, i64 3 + %1076 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %930, <4 x float> %1075, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1077 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %931, <4 x float> %1076, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1078 = insertelement <4 x float> poison, float %647, i64 0 + %1079 = insertelement <4 x float> %1078, float %648, i64 1 + %1080 = insertelement <4 x float> %1079, float %649, i64 2 + %1081 = 
insertelement <4 x float> %1080, float %650, i64 3 + %1082 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %932, <4 x float> %1081, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1083 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %933, <4 x float> %1082, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1084 = insertelement <4 x float> poison, float %651, i64 0 + %1085 = insertelement <4 x float> %1084, float %652, i64 1 + %1086 = insertelement <4 x float> %1085, float %653, i64 2 + %1087 = insertelement <4 x float> %1086, float %654, i64 3 + %1088 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %932, <4 x float> %1087, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1089 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %933, <4 x float> %1088, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !33 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !34 + %1090 = shufflevector <8 x half> %1038, <8 x half> poison, <4 x i32> + %1091 = shufflevector <8 x half> %1038, <8 x half> poison, <4 x i32> + %1092 = shufflevector <8 x half> %1039, <8 x half> poison, <4 x i32> + %1093 = shufflevector <8 x half> %1039, <8 x half> poison, <4 x i32> + %1094 = insertelement <4 x float> poison, float %623, i64 0 + %1095 = insertelement <4 x float> %1094, float %624, i64 1 + %1096 = insertelement <4 x float> %1095, float %625, i64 2 + %1097 = insertelement <4 x float> %1096, float %626, i64 3 + %1098 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1090, <4 x float> %1097, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1099 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1091, <4 x float> %1098, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1100 = insertelement <4 x float> poison, float %627, i64 0 + %1101 = insertelement <4 x float> %1100, float %628, i64 1 + %1102 = insertelement <4 x float> %1101, float %629, i64 2 + %1103 = insertelement <4 x float> %1102, float %630, i64 3 + %1104 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1090, <4 x float> %1103, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1105 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1091, <4 x float> %1104, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1106 = insertelement <4 x float> poison, float %631, i64 0 + %1107 = insertelement <4 x float> %1106, float %632, i64 1 + %1108 = insertelement <4 x float> %1107, float %633, i64 2 + %1109 = insertelement <4 x float> %1108, float %634, i64 3 + %1110 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1092, <4 x float> %1109, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1111 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1093, <4 x float> %1110, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1112 = insertelement <4 x float> poison, float %635, i64 0 + %1113 = insertelement <4 x float> %1112, float %636, i64 1 + %1114 = insertelement <4 x float> %1113, float %637, i64 2 + %1115 = insertelement <4 x float> %1114, float %638, i64 3 + %1116 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1092, <4 x float> %1115, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1117 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1093, <4 x float> %1116, i32 0, 
i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1118 = insertelement <4 x float> poison, float %607, i64 0 + %1119 = insertelement <4 x float> %1118, float %608, i64 1 + %1120 = insertelement <4 x float> %1119, float %609, i64 2 + %1121 = insertelement <4 x float> %1120, float %610, i64 3 + %1122 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1090, <4 x float> %1121, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1123 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1091, <4 x float> %1122, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1124 = insertelement <4 x float> poison, float %611, i64 0 + %1125 = insertelement <4 x float> %1124, float %612, i64 1 + %1126 = insertelement <4 x float> %1125, float %613, i64 2 + %1127 = insertelement <4 x float> %1126, float %614, i64 3 + %1128 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1090, <4 x float> %1127, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1129 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1091, <4 x float> %1128, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1130 = insertelement <4 x float> poison, float %615, i64 0 + %1131 = insertelement <4 x float> %1130, float %616, i64 1 + %1132 = insertelement <4 x float> %1131, float %617, i64 2 + %1133 = insertelement <4 x float> %1132, float %618, i64 3 + %1134 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1092, <4 x float> %1133, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1135 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1093, <4 x float> %1134, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1136 = insertelement <4 
x float> poison, float %619, i64 0 + %1137 = insertelement <4 x float> %1136, float %620, i64 1 + %1138 = insertelement <4 x float> %1137, float %621, i64 2 + %1139 = insertelement <4 x float> %1138, float %622, i64 3 + %1140 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1092, <4 x float> %1139, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1141 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1093, <4 x float> %1140, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1142 = shufflevector <8 x half> %1040, <8 x half> poison, <4 x i32> + %1143 = shufflevector <8 x half> %1040, <8 x half> poison, <4 x i32> + %1144 = shufflevector <8 x half> %1041, <8 x half> poison, <4 x i32> + %1145 = shufflevector <8 x half> %1041, <8 x half> poison, <4 x i32> + %1146 = insertelement <4 x float> poison, float %559, i64 0 + %1147 = insertelement <4 x float> %1146, float %560, i64 1 + %1148 = insertelement <4 x float> %1147, float %561, i64 2 + %1149 = insertelement <4 x float> %1148, float %562, i64 3 + %1150 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1142, <4 x float> %1149, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1151 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1143, <4 x float> %1150, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1152 = insertelement <4 x float> poison, float %563, i64 0 + %1153 = insertelement <4 x float> %1152, float %564, i64 1 + %1154 = insertelement <4 x float> %1153, float %565, i64 2 + %1155 = insertelement <4 x float> %1154, float %566, i64 3 + %1156 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1142, <4 x float> %1155, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1157 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1143, <4 x float> %1156, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1158 = insertelement <4 x float> poison, float %567, i64 0 + %1159 = insertelement <4 x float> %1158, float %568, i64 1 + %1160 = insertelement <4 x float> %1159, float %569, i64 2 + %1161 = insertelement <4 x float> %1160, float %570, i64 3 + %1162 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1144, <4 x float> %1161, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1163 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1145, <4 x float> %1162, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1164 = insertelement <4 x float> poison, float %571, i64 0 + %1165 = insertelement <4 x float> %1164, float %572, i64 1 + %1166 = insertelement <4 x float> %1165, float %573, i64 2 + %1167 = insertelement <4 x float> %1166, float %574, i64 3 + %1168 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1144, <4 x float> %1167, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1169 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1145, <4 x float> %1168, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1170 = insertelement <4 x float> poison, float %543, i64 0 + %1171 = insertelement <4 x float> %1170, float %544, i64 1 + %1172 = insertelement <4 x float> %1171, float %545, i64 2 + %1173 = insertelement <4 x float> %1172, float %546, i64 3 + %1174 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1142, <4 x float> %1173, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1175 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1143, <4 x float> %1174, i32 0, 
i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1176 = insertelement <4 x float> poison, float %547, i64 0 + %1177 = insertelement <4 x float> %1176, float %548, i64 1 + %1178 = insertelement <4 x float> %1177, float %549, i64 2 + %1179 = insertelement <4 x float> %1178, float %550, i64 3 + %1180 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1142, <4 x float> %1179, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1181 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1143, <4 x float> %1180, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1182 = insertelement <4 x float> poison, float %551, i64 0 + %1183 = insertelement <4 x float> %1182, float %552, i64 1 + %1184 = insertelement <4 x float> %1183, float %553, i64 2 + %1185 = insertelement <4 x float> %1184, float %554, i64 3 + %1186 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1144, <4 x float> %1185, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1187 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1145, <4 x float> %1186, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1188 = insertelement <4 x float> poison, float %555, i64 0 + %1189 = insertelement <4 x float> %1188, float %556, i64 1 + %1190 = insertelement <4 x float> %1189, float %557, i64 2 + %1191 = insertelement <4 x float> %1190, float %558, i64 3 + %1192 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1144, <4 x float> %1191, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1193 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1145, <4 x float> %1192, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void 
@llvm.amdgcn.sched.barrier(i32 1030), !dbg !35 + %1194 = insertelement <4 x float> poison, float %591, i64 0 + %1195 = insertelement <4 x float> %1194, float %592, i64 1 + %1196 = insertelement <4 x float> %1195, float %593, i64 2 + %1197 = insertelement <4 x float> %1196, float %594, i64 3 + %1198 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1090, <4 x float> %1197, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1199 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1091, <4 x float> %1198, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1200 = insertelement <4 x float> poison, float %595, i64 0 + %1201 = insertelement <4 x float> %1200, float %596, i64 1 + %1202 = insertelement <4 x float> %1201, float %597, i64 2 + %1203 = insertelement <4 x float> %1202, float %598, i64 3 + %1204 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1090, <4 x float> %1203, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1205 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1091, <4 x float> %1204, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1206 = insertelement <4 x float> poison, float %599, i64 0 + %1207 = insertelement <4 x float> %1206, float %600, i64 1 + %1208 = insertelement <4 x float> %1207, float %601, i64 2 + %1209 = insertelement <4 x float> %1208, float %602, i64 3 + %1210 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1092, <4 x float> %1209, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1211 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1093, <4 x float> %1210, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1212 = insertelement <4 x float> poison, 
float %603, i64 0 + %1213 = insertelement <4 x float> %1212, float %604, i64 1 + %1214 = insertelement <4 x float> %1213, float %605, i64 2 + %1215 = insertelement <4 x float> %1214, float %606, i64 3 + %1216 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1092, <4 x float> %1215, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1217 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1093, <4 x float> %1216, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !36 + %1218 = load <8 x half>, ptr addrspace(3) %385, align 16 + %1219 = load <8 x half>, ptr addrspace(3) %386, align 16 + %1220 = load <8 x half>, ptr addrspace(3) %388, align 16 + %1221 = load <8 x half>, ptr addrspace(3) %389, align 16 + %1222 = insertelement <4 x float> poison, float %575, i64 0 + %1223 = insertelement <4 x float> %1222, float %576, i64 1 + %1224 = insertelement <4 x float> %1223, float %577, i64 2 + %1225 = insertelement <4 x float> %1224, float %578, i64 3 + %1226 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1090, <4 x float> %1225, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1227 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1091, <4 x float> %1226, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1228 = insertelement <4 x float> poison, float %579, i64 0 + %1229 = insertelement <4 x float> %1228, float %580, i64 1 + %1230 = insertelement <4 x float> %1229, float %581, i64 2 + %1231 = insertelement <4 x float> %1230, float %582, i64 3 + %1232 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1090, <4 x float> %1231, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1233 = tail call <4 x 
float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1091, <4 x float> %1232, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1234 = insertelement <4 x float> poison, float %583, i64 0 + %1235 = insertelement <4 x float> %1234, float %584, i64 1 + %1236 = insertelement <4 x float> %1235, float %585, i64 2 + %1237 = insertelement <4 x float> %1236, float %586, i64 3 + %1238 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1092, <4 x float> %1237, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1239 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1093, <4 x float> %1238, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1240 = insertelement <4 x float> poison, float %587, i64 0 + %1241 = insertelement <4 x float> %1240, float %588, i64 1 + %1242 = insertelement <4 x float> %1241, float %589, i64 2 + %1243 = insertelement <4 x float> %1242, float %590, i64 3 + %1244 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1092, <4 x float> %1243, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1245 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1093, <4 x float> %1244, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !37 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !38 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !39 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !40 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !41 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !42 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 
2, i32 0), !dbg !43 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !44 + tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !45 + %1246 = load <8 x half>, ptr addrspace(3) %391, align 16 + %1247 = load <8 x half>, ptr addrspace(3) %392, align 16 + %1248 = load <8 x half>, ptr addrspace(3) %395, align 16 + %1249 = load <8 x half>, ptr addrspace(3) %396, align 16 + %1250 = insertelement <4 x float> poison, float %527, i64 0 + %1251 = insertelement <4 x float> %1250, float %528, i64 1 + %1252 = insertelement <4 x float> %1251, float %529, i64 2 + %1253 = insertelement <4 x float> %1252, float %530, i64 3 + %1254 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1142, <4 x float> %1253, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1255 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1143, <4 x float> %1254, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1256 = insertelement <4 x float> poison, float %531, i64 0 + %1257 = insertelement <4 x float> %1256, float %532, i64 1 + %1258 = insertelement <4 x float> %1257, float %533, i64 2 + %1259 = insertelement <4 x float> %1258, float %534, i64 3 + %1260 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1142, <4 x float> %1259, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1261 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1143, <4 x float> %1260, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1262 = insertelement <4 x float> poison, float %535, i64 0 + %1263 = insertelement <4 x float> %1262, float %536, i64 1 + %1264 = insertelement <4 x float> %1263, float %537, i64 2 + %1265 = insertelement <4 x float> %1264, float %538, i64 3 + %1266 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> 
%982, <4 x half> %1144, <4 x float> %1265, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1267 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1145, <4 x float> %1266, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1268 = insertelement <4 x float> poison, float %539, i64 0 + %1269 = insertelement <4 x float> %1268, float %540, i64 1 + %1270 = insertelement <4 x float> %1269, float %541, i64 2 + %1271 = insertelement <4 x float> %1270, float %542, i64 3 + %1272 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1144, <4 x float> %1271, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1273 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1145, <4 x float> %1272, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !46 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !47 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !48 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !49 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !50 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !51 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !52 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !53 + tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !54 + %1274 = insertelement <4 x float> poison, float %511, i64 0 + %1275 = insertelement <4 x float> %1274, float %512, i64 1 + %1276 = insertelement <4 x float> %1275, float %513, i64 2 + %1277 = insertelement <4 x float> %1276, float %514, i64 3 + %1278 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 
x half> %1142, <4 x float> %1277, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1279 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1143, <4 x float> %1278, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1280 = insertelement <4 x float> poison, float %515, i64 0 + %1281 = insertelement <4 x float> %1280, float %516, i64 1 + %1282 = insertelement <4 x float> %1281, float %517, i64 2 + %1283 = insertelement <4 x float> %1282, float %518, i64 3 + %1284 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1142, <4 x float> %1283, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1285 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1143, <4 x float> %1284, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1286 = insertelement <4 x float> poison, float %519, i64 0 + %1287 = insertelement <4 x float> %1286, float %520, i64 1 + %1288 = insertelement <4 x float> %1287, float %521, i64 2 + %1289 = insertelement <4 x float> %1288, float %522, i64 3 + %1290 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1144, <4 x float> %1289, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1291 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1145, <4 x float> %1290, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1292 = insertelement <4 x float> poison, float %523, i64 0 + %1293 = insertelement <4 x float> %1292, float %524, i64 1 + %1294 = insertelement <4 x float> %1293, float %525, i64 2 + %1295 = insertelement <4 x float> %1294, float %526, i64 3 + %1296 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1144, <4 x float> %1295, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + %1297 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1145, <4 x float> %1296, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !55 + tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !56 + %1298 = shufflevector <8 x half> %1218, <8 x half> poison, <4 x i32> + %1299 = shufflevector <8 x half> %1218, <8 x half> poison, <4 x i32> + %1300 = shufflevector <8 x half> %1219, <8 x half> poison, <4 x i32> + %1301 = shufflevector <8 x half> %1219, <8 x half> poison, <4 x i32> + %1302 = shufflevector <8 x half> %1220, <8 x half> poison, <4 x i32> + %1303 = shufflevector <8 x half> %1220, <8 x half> poison, <4 x i32> + %1304 = shufflevector <8 x half> %1221, <8 x half> poison, <4 x i32> + %1305 = shufflevector <8 x half> %1221, <8 x half> poison, <4 x i32> + %1306 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1298, <4 x float> %879, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1307 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1299, <4 x float> %1306, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1308 = extractelement <4 x float> %1307, i64 0 + %1309 = extractelement <4 x float> %1307, i64 1 + %1310 = extractelement <4 x float> %1307, i64 2 + %1311 = extractelement <4 x float> %1307, i64 3 + %1312 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1298, <4 x float> %885, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1313 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1299, <4 x float> %1312, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1314 = extractelement <4 x float> %1313, i64 0 + %1315 = 
extractelement <4 x float> %1313, i64 1 + %1316 = extractelement <4 x float> %1313, i64 2 + %1317 = extractelement <4 x float> %1313, i64 3 + %1318 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1300, <4 x float> %891, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1319 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1301, <4 x float> %1318, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1320 = extractelement <4 x float> %1319, i64 0 + %1321 = extractelement <4 x float> %1319, i64 1 + %1322 = extractelement <4 x float> %1319, i64 2 + %1323 = extractelement <4 x float> %1319, i64 3 + %1324 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1300, <4 x float> %897, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1325 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1301, <4 x float> %1324, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1326 = extractelement <4 x float> %1325, i64 0 + %1327 = extractelement <4 x float> %1325, i64 1 + %1328 = extractelement <4 x float> %1325, i64 2 + %1329 = extractelement <4 x float> %1325, i64 3 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !57 + tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !58 + %1330 = load <8 x half>, ptr addrspace(3) %399, align 16 + %1331 = load <8 x half>, ptr addrspace(3) %400, align 16 + %1332 = load <8 x half>, ptr addrspace(3) %403, align 16 + %1333 = load <8 x half>, ptr addrspace(3) %404, align 16 + %1334 = shufflevector <8 x half> %1246, <8 x half> poison, <4 x i32> + %1335 = shufflevector <8 x half> %1246, <8 x half> poison, <4 x i32> + %1336 = shufflevector <8 x half> %1247, <8 x half> poison, <4 x i32> + %1337 = shufflevector <8 x half> %1247, <8 x half> poison, <4 x i32> + %1338 = 
tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1298, <4 x float> %907, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1339 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1299, <4 x float> %1338, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1340 = extractelement <4 x float> %1339, i64 0 + %1341 = extractelement <4 x float> %1339, i64 1 + %1342 = extractelement <4 x float> %1339, i64 2 + %1343 = extractelement <4 x float> %1339, i64 3 + %1344 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1298, <4 x float> %913, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1345 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1299, <4 x float> %1344, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1346 = extractelement <4 x float> %1345, i64 0 + %1347 = extractelement <4 x float> %1345, i64 1 + %1348 = extractelement <4 x float> %1345, i64 2 + %1349 = extractelement <4 x float> %1345, i64 3 + %1350 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1300, <4 x float> %919, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1351 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1301, <4 x float> %1350, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1352 = extractelement <4 x float> %1351, i64 0 + %1353 = extractelement <4 x float> %1351, i64 1 + %1354 = extractelement <4 x float> %1351, i64 2 + %1355 = extractelement <4 x float> %1351, i64 3 + %1356 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1300, <4 x float> %925, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1357 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1301, <4 x float> %1356, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1358 = extractelement <4 x float> %1357, i64 0 + %1359 = extractelement <4 x float> %1357, i64 1 + %1360 = extractelement <4 x float> %1357, i64 2 + %1361 = extractelement <4 x float> %1357, i64 3 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !59 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !60 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !61 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !62 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !63 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !64 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !65 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !66 + tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !67 + %1362 = load <8 x half>, ptr addrspace(3) %407, align 16 + %1363 = load <8 x half>, ptr addrspace(3) %408, align 16 + %1364 = load <8 x half>, ptr addrspace(3) %411, align 16 + %1365 = load <8 x half>, ptr addrspace(3) %412, align 16 + %1366 = shufflevector <8 x half> %1248, <8 x half> poison, <4 x i32> + %1367 = shufflevector <8 x half> %1248, <8 x half> poison, <4 x i32> + %1368 = shufflevector <8 x half> %1249, <8 x half> poison, <4 x i32> + %1369 = shufflevector <8 x half> %1249, <8 x half> poison, <4 x i32> + %1370 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1366, <4 x float> %939, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1371 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1367, <4 x float> %1370, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + 
%1372 = extractelement <4 x float> %1371, i64 0 + %1373 = extractelement <4 x float> %1371, i64 1 + %1374 = extractelement <4 x float> %1371, i64 2 + %1375 = extractelement <4 x float> %1371, i64 3 + %1376 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1366, <4 x float> %945, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1377 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1367, <4 x float> %1376, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1378 = extractelement <4 x float> %1377, i64 0 + %1379 = extractelement <4 x float> %1377, i64 1 + %1380 = extractelement <4 x float> %1377, i64 2 + %1381 = extractelement <4 x float> %1377, i64 3 + %1382 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1368, <4 x float> %951, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1383 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1369, <4 x float> %1382, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1384 = extractelement <4 x float> %1383, i64 0 + %1385 = extractelement <4 x float> %1383, i64 1 + %1386 = extractelement <4 x float> %1383, i64 2 + %1387 = extractelement <4 x float> %1383, i64 3 + %1388 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1368, <4 x float> %957, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1389 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1369, <4 x float> %1388, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1390 = extractelement <4 x float> %1389, i64 0 + %1391 = extractelement <4 x float> %1389, i64 1 + %1392 = extractelement <4 x float> %1389, i64 2 + %1393 = extractelement <4 x float> %1389, i64 3 + tail call void 
@llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !68 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !69 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !70 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !71 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !72 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !73 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !74 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !75 + tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !76 + %1394 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1366, <4 x float> %963, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1395 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1367, <4 x float> %1394, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1396 = extractelement <4 x float> %1395, i64 0 + %1397 = extractelement <4 x float> %1395, i64 1 + %1398 = extractelement <4 x float> %1395, i64 2 + %1399 = extractelement <4 x float> %1395, i64 3 + %1400 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1366, <4 x float> %969, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1401 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1367, <4 x float> %1400, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1402 = extractelement <4 x float> %1401, i64 0 + %1403 = extractelement <4 x float> %1401, i64 1 + %1404 = extractelement <4 x float> %1401, i64 2 + %1405 = extractelement <4 x float> %1401, i64 3 + %1406 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1368, <4 x float> 
%975, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1407 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1369, <4 x float> %1406, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1408 = extractelement <4 x float> %1407, i64 0 + %1409 = extractelement <4 x float> %1407, i64 1 + %1410 = extractelement <4 x float> %1407, i64 2 + %1411 = extractelement <4 x float> %1407, i64 3 + %1412 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1368, <4 x float> %981, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1413 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1369, <4 x float> %1412, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1414 = extractelement <4 x float> %1413, i64 0 + %1415 = extractelement <4 x float> %1413, i64 1 + %1416 = extractelement <4 x float> %1413, i64 2 + %1417 = extractelement <4 x float> %1413, i64 3 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !77 + tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !78 + fence syncscope("workgroup") release + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + %1418 = shufflevector <2 x half> %865, <2 x half> %864, <8 x i32> + %1419 = shufflevector <2 x half> %863, <2 x half> poison, <8 x i32> + %1420 = shufflevector <8 x half> %1418, <8 x half> %1419, <8 x i32> + %1421 = shufflevector <2 x half> %862, <2 x half> poison, <8 x i32> + %1422 = shufflevector <8 x half> %1420, <8 x half> %1421, <8 x i32> + store <8 x half> %1422, ptr addrspace(3) %199, align 16 + %1423 = shufflevector <2 x half> %861, <2 x half> %860, <8 x i32> + %1424 = shufflevector <2 x half> %859, <2 x half> poison, <8 x i32> + %1425 = shufflevector <8 x half> %1423, <8 x half> %1424, <8 x i32> + %1426 = shufflevector <2 x half> %858, <2 x half> 
poison, <8 x i32> + %1427 = shufflevector <8 x half> %1425, <8 x half> %1426, <8 x i32> + store <8 x half> %1427, ptr addrspace(3) %201, align 16 + %1428 = shufflevector <2 x half> %857, <2 x half> %856, <8 x i32> + %1429 = shufflevector <2 x half> %855, <2 x half> poison, <8 x i32> + %1430 = shufflevector <8 x half> %1428, <8 x half> %1429, <8 x i32> + %1431 = shufflevector <2 x half> %854, <2 x half> poison, <8 x i32> + %1432 = shufflevector <8 x half> %1430, <8 x half> %1431, <8 x i32> + store <8 x half> %1432, ptr addrspace(3) %203, align 16 + %1433 = shufflevector <2 x half> %853, <2 x half> %852, <8 x i32> + %1434 = shufflevector <2 x half> %851, <2 x half> poison, <8 x i32> + %1435 = shufflevector <8 x half> %1433, <8 x half> %1434, <8 x i32> + %1436 = shufflevector <2 x half> %850, <2 x half> poison, <8 x i32> + %1437 = shufflevector <8 x half> %1435, <8 x half> %1436, <8 x i32> + store <8 x half> %1437, ptr addrspace(3) %205, align 16 + %1438 = shufflevector <2 x half> %849, <2 x half> %848, <8 x i32> + %1439 = shufflevector <2 x half> %847, <2 x half> poison, <8 x i32> + %1440 = shufflevector <8 x half> %1438, <8 x half> %1439, <8 x i32> + %1441 = shufflevector <2 x half> %846, <2 x half> poison, <8 x i32> + %1442 = shufflevector <8 x half> %1440, <8 x half> %1441, <8 x i32> + store <8 x half> %1442, ptr addrspace(3) %207, align 16 + %1443 = shufflevector <2 x half> %845, <2 x half> %844, <8 x i32> + %1444 = shufflevector <2 x half> %843, <2 x half> poison, <8 x i32> + %1445 = shufflevector <8 x half> %1443, <8 x half> %1444, <8 x i32> + %1446 = shufflevector <2 x half> %842, <2 x half> poison, <8 x i32> + %1447 = shufflevector <8 x half> %1445, <8 x half> %1446, <8 x i32> + store <8 x half> %1447, ptr addrspace(3) %209, align 16 + %1448 = shufflevector <2 x half> %841, <2 x half> %840, <8 x i32> + %1449 = shufflevector <2 x half> %839, <2 x half> poison, <8 x i32> + %1450 = shufflevector <8 x half> %1448, <8 x half> %1449, <8 x i32> + %1451 = 
shufflevector <2 x half> %838, <2 x half> poison, <8 x i32> + %1452 = shufflevector <8 x half> %1450, <8 x half> %1451, <8 x i32> + store <8 x half> %1452, ptr addrspace(3) %211, align 16 + %1453 = shufflevector <2 x half> %837, <2 x half> %836, <8 x i32> + %1454 = shufflevector <2 x half> %835, <2 x half> poison, <8 x i32> + %1455 = shufflevector <8 x half> %1453, <8 x half> %1454, <8 x i32> + %1456 = shufflevector <2 x half> %834, <2 x half> poison, <8 x i32> + %1457 = shufflevector <8 x half> %1455, <8 x half> %1456, <8 x i32> + store <8 x half> %1457, ptr addrspace(3) %213, align 16 + %1458 = getelementptr i8, ptr addrspace(1) %752, i64 128 + %1459 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %1458, i16 0, i32 2147483646, i32 159744) + %1460 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %68, i32 0, i32 0) + %1461 = bitcast <4 x i32> %1460 to <8 x half> + %1462 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %71, i32 0, i32 0) + %1463 = bitcast <4 x i32> %1462 to <8 x half> + %1464 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %74, i32 0, i32 0) + %1465 = bitcast <4 x i32> %1464 to <8 x half> + %1466 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %77, i32 0, i32 0) + %1467 = bitcast <4 x i32> %1466 to <8 x half> + %1468 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %80, i32 0, i32 0) + %1469 = bitcast <4 x i32> %1468 to <8 x half> + %1470 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %83, i32 0, i32 0) + %1471 = bitcast <4 x i32> %1470 to <8 x half> + %1472 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %86, i32 0, i32 0) + %1473 = bitcast <4 x i32> %1472 to <8 x half> + %1474 = tail call <4 x i32> 
@llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %89, i32 0, i32 0) + %1475 = bitcast <4 x i32> %1474 to <8 x half> + %1476 = shufflevector <8 x half> %1330, <8 x half> poison, <4 x i32> + %1477 = shufflevector <8 x half> %1330, <8 x half> poison, <4 x i32> + %1478 = shufflevector <8 x half> %1331, <8 x half> poison, <4 x i32> + %1479 = shufflevector <8 x half> %1331, <8 x half> poison, <4 x i32> + %1480 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1298, <4 x float> %991, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1481 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1299, <4 x float> %1480, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1482 = extractelement <4 x float> %1481, i64 0 + %1483 = extractelement <4 x float> %1481, i64 1 + %1484 = extractelement <4 x float> %1481, i64 2 + %1485 = extractelement <4 x float> %1481, i64 3 + %1486 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1298, <4 x float> %997, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1487 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1299, <4 x float> %1486, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1488 = extractelement <4 x float> %1487, i64 0 + %1489 = extractelement <4 x float> %1487, i64 1 + %1490 = extractelement <4 x float> %1487, i64 2 + %1491 = extractelement <4 x float> %1487, i64 3 + %1492 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1300, <4 x float> %1003, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1493 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1301, <4 x float> %1492, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1494 
= extractelement <4 x float> %1493, i64 0 + %1495 = extractelement <4 x float> %1493, i64 1 + %1496 = extractelement <4 x float> %1493, i64 2 + %1497 = extractelement <4 x float> %1493, i64 3 + %1498 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1300, <4 x float> %1009, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1499 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1301, <4 x float> %1498, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1500 = extractelement <4 x float> %1499, i64 0 + %1501 = extractelement <4 x float> %1499, i64 1 + %1502 = extractelement <4 x float> %1499, i64 2 + %1503 = extractelement <4 x float> %1499, i64 3 + %1504 = shufflevector <8 x half> %1332, <8 x half> poison, <4 x i32> + %1505 = shufflevector <8 x half> %1332, <8 x half> poison, <4 x i32> + %1506 = shufflevector <8 x half> %1333, <8 x half> poison, <4 x i32> + %1507 = shufflevector <8 x half> %1333, <8 x half> poison, <4 x i32> + %1508 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1298, <4 x float> %1019, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1509 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1299, <4 x float> %1508, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1510 = extractelement <4 x float> %1509, i64 0 + %1511 = extractelement <4 x float> %1509, i64 1 + %1512 = extractelement <4 x float> %1509, i64 2 + %1513 = extractelement <4 x float> %1509, i64 3 + %1514 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1298, <4 x float> %1025, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1515 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1299, <4 x float> %1514, i32 0, i32 0, i32 0) + 
tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1516 = extractelement <4 x float> %1515, i64 0 + %1517 = extractelement <4 x float> %1515, i64 1 + %1518 = extractelement <4 x float> %1515, i64 2 + %1519 = extractelement <4 x float> %1515, i64 3 + %1520 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1300, <4 x float> %1031, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1521 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1301, <4 x float> %1520, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1522 = extractelement <4 x float> %1521, i64 0 + %1523 = extractelement <4 x float> %1521, i64 1 + %1524 = extractelement <4 x float> %1521, i64 2 + %1525 = extractelement <4 x float> %1521, i64 3 + %1526 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1300, <4 x float> %1037, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1527 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1301, <4 x float> %1526, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1528 = extractelement <4 x float> %1527, i64 0 + %1529 = extractelement <4 x float> %1527, i64 1 + %1530 = extractelement <4 x float> %1527, i64 2 + %1531 = extractelement <4 x float> %1527, i64 3 + %1532 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1366, <4 x float> %1047, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1533 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1367, <4 x float> %1532, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1534 = extractelement <4 x float> %1533, i64 0 + %1535 = extractelement <4 x float> %1533, i64 1 + %1536 = extractelement <4 x float> %1533, i64 2 + %1537 = 
extractelement <4 x float> %1533, i64 3 + %1538 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1366, <4 x float> %1053, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1539 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1367, <4 x float> %1538, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1540 = extractelement <4 x float> %1539, i64 0 + %1541 = extractelement <4 x float> %1539, i64 1 + %1542 = extractelement <4 x float> %1539, i64 2 + %1543 = extractelement <4 x float> %1539, i64 3 + %1544 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1368, <4 x float> %1059, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1545 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1369, <4 x float> %1544, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1546 = extractelement <4 x float> %1545, i64 0 + %1547 = extractelement <4 x float> %1545, i64 1 + %1548 = extractelement <4 x float> %1545, i64 2 + %1549 = extractelement <4 x float> %1545, i64 3 + %1550 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1368, <4 x float> %1065, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1551 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1369, <4 x float> %1550, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1552 = extractelement <4 x float> %1551, i64 0 + %1553 = extractelement <4 x float> %1551, i64 1 + %1554 = extractelement <4 x float> %1551, i64 2 + %1555 = extractelement <4 x float> %1551, i64 3 + %1556 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1366, <4 x float> %1071, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + %1557 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1367, <4 x float> %1556, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1558 = extractelement <4 x float> %1557, i64 0 + %1559 = extractelement <4 x float> %1557, i64 1 + %1560 = extractelement <4 x float> %1557, i64 2 + %1561 = extractelement <4 x float> %1557, i64 3 + %1562 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1366, <4 x float> %1077, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1563 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1367, <4 x float> %1562, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1564 = extractelement <4 x float> %1563, i64 0 + %1565 = extractelement <4 x float> %1563, i64 1 + %1566 = extractelement <4 x float> %1563, i64 2 + %1567 = extractelement <4 x float> %1563, i64 3 + %1568 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1368, <4 x float> %1083, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1569 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1369, <4 x float> %1568, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1570 = extractelement <4 x float> %1569, i64 0 + %1571 = extractelement <4 x float> %1569, i64 1 + %1572 = extractelement <4 x float> %1569, i64 2 + %1573 = extractelement <4 x float> %1569, i64 3 + %1574 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1368, <4 x float> %1089, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1575 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1369, <4 x float> %1574, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + %1576 = extractelement <4 x float> %1575, i64 0 + %1577 = extractelement <4 x float> %1575, i64 1 + %1578 = extractelement <4 x float> %1575, i64 2 + %1579 = extractelement <4 x float> %1575, i64 3 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !79 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !80 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !81 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !82 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !83 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !84 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !85 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !86 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !87 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !88 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !89 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !90 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !91 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !92 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !93 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !94 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !95 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !96 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !97 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !98 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !99 + tail call void 
@llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !100 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !101 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !102 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !103 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !104 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !105 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !106 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !107 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !108 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !109 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !110 + tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !111 + %1580 = shufflevector <2 x half> %833, <2 x half> %832, <8 x i32> + %1581 = shufflevector <2 x half> %831, <2 x half> poison, <8 x i32> + %1582 = shufflevector <8 x half> %1580, <8 x half> %1581, <8 x i32> + %1583 = shufflevector <2 x half> %830, <2 x half> poison, <8 x i32> + %1584 = shufflevector <8 x half> %1582, <8 x half> %1583, <8 x i32> + store <8 x half> %1584, ptr addrspace(3) %214, align 16 + %1585 = shufflevector <2 x half> %829, <2 x half> %828, <8 x i32> + %1586 = shufflevector <2 x half> %827, <2 x half> poison, <8 x i32> + %1587 = shufflevector <8 x half> %1585, <8 x half> %1586, <8 x i32> + %1588 = shufflevector <2 x half> %826, <2 x half> poison, <8 x i32> + %1589 = shufflevector <8 x half> %1587, <8 x half> %1588, <8 x i32> + store <8 x half> %1589, ptr addrspace(3) %215, align 16 + %1590 = shufflevector <2 x half> %825, <2 x half> %824, <8 x i32> + %1591 = shufflevector <2 x half> %823, <2 x half> poison, <8 x i32> + %1592 = shufflevector <8 x half> %1590, <8 x half> %1591, <8 x i32> + %1593 = 
shufflevector <2 x half> %822, <2 x half> poison, <8 x i32> + %1594 = shufflevector <8 x half> %1592, <8 x half> %1593, <8 x i32> + store <8 x half> %1594, ptr addrspace(3) %216, align 16 + %1595 = shufflevector <2 x half> %821, <2 x half> %820, <8 x i32> + %1596 = shufflevector <2 x half> %819, <2 x half> poison, <8 x i32> + %1597 = shufflevector <8 x half> %1595, <8 x half> %1596, <8 x i32> + %1598 = shufflevector <2 x half> %818, <2 x half> poison, <8 x i32> + %1599 = shufflevector <8 x half> %1597, <8 x half> %1598, <8 x i32> + store <8 x half> %1599, ptr addrspace(3) %217, align 16 + %1600 = shufflevector <2 x half> %817, <2 x half> %816, <8 x i32> + %1601 = shufflevector <2 x half> %815, <2 x half> poison, <8 x i32> + %1602 = shufflevector <8 x half> %1600, <8 x half> %1601, <8 x i32> + %1603 = shufflevector <2 x half> %814, <2 x half> poison, <8 x i32> + %1604 = shufflevector <8 x half> %1602, <8 x half> %1603, <8 x i32> + store <8 x half> %1604, ptr addrspace(3) %218, align 16 + %1605 = shufflevector <2 x half> %813, <2 x half> %812, <8 x i32> + %1606 = shufflevector <2 x half> %811, <2 x half> poison, <8 x i32> + %1607 = shufflevector <8 x half> %1605, <8 x half> %1606, <8 x i32> + %1608 = shufflevector <2 x half> %810, <2 x half> poison, <8 x i32> + %1609 = shufflevector <8 x half> %1607, <8 x half> %1608, <8 x i32> + store <8 x half> %1609, ptr addrspace(3) %219, align 16 + %1610 = shufflevector <2 x half> %809, <2 x half> %808, <8 x i32> + %1611 = shufflevector <2 x half> %807, <2 x half> poison, <8 x i32> + %1612 = shufflevector <8 x half> %1610, <8 x half> %1611, <8 x i32> + %1613 = shufflevector <2 x half> %806, <2 x half> poison, <8 x i32> + %1614 = shufflevector <8 x half> %1612, <8 x half> %1613, <8 x i32> + store <8 x half> %1614, ptr addrspace(3) %220, align 16 + %1615 = shufflevector <2 x half> %805, <2 x half> %804, <8 x i32> + %1616 = shufflevector <2 x half> %803, <2 x half> poison, <8 x i32> + %1617 = shufflevector <8 x half> %1615, <8 x 
half> %1616, <8 x i32> + %1618 = shufflevector <2 x half> %802, <2 x half> poison, <8 x i32> + %1619 = shufflevector <8 x half> %1617, <8 x half> %1618, <8 x i32> + store <8 x half> %1619, ptr addrspace(3) %221, align 16 + %1620 = getelementptr i8, ptr addrspace(1) %751, i64 128 + %1621 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %1620, i16 0, i32 2147483646, i32 159744) + %1622 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %140, i32 0, i32 0) + %1623 = bitcast <4 x i32> %1622 to <8 x half> + %1624 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %143, i32 0, i32 0) + %1625 = bitcast <4 x i32> %1624 to <8 x half> + %1626 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %146, i32 0, i32 0) + %1627 = bitcast <4 x i32> %1626 to <8 x half> + %1628 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %149, i32 0, i32 0) + %1629 = bitcast <4 x i32> %1628 to <8 x half> + %1630 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %152, i32 0, i32 0) + %1631 = bitcast <4 x i32> %1630 to <8 x half> + %1632 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %155, i32 0, i32 0) + %1633 = bitcast <4 x i32> %1632 to <8 x half> + %1634 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %158, i32 0, i32 0) + %1635 = bitcast <4 x i32> %1634 to <8 x half> + %1636 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %161, i32 0, i32 0) + %1637 = bitcast <4 x i32> %1636 to <8 x half> + %1638 = shufflevector <8 x half> %1362, <8 x half> poison, <4 x i32> + %1639 = shufflevector <8 x half> %1362, <8 x half> poison, <4 x i32> + %1640 = shufflevector <8 x half> %1363, <8 x half> poison, <4 x i32> + %1641 = shufflevector <8 x half> %1363, <8 x half> 
poison, <4 x i32> + %1642 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1638, <4 x float> %1099, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1643 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1639, <4 x float> %1642, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1644 = extractelement <4 x float> %1643, i64 0 + %1645 = extractelement <4 x float> %1643, i64 1 + %1646 = extractelement <4 x float> %1643, i64 2 + %1647 = extractelement <4 x float> %1643, i64 3 + %1648 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1638, <4 x float> %1105, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1649 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1639, <4 x float> %1648, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1650 = extractelement <4 x float> %1649, i64 0 + %1651 = extractelement <4 x float> %1649, i64 1 + %1652 = extractelement <4 x float> %1649, i64 2 + %1653 = extractelement <4 x float> %1649, i64 3 + %1654 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1640, <4 x float> %1111, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1655 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1641, <4 x float> %1654, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1656 = extractelement <4 x float> %1655, i64 0 + %1657 = extractelement <4 x float> %1655, i64 1 + %1658 = extractelement <4 x float> %1655, i64 2 + %1659 = extractelement <4 x float> %1655, i64 3 + %1660 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1640, <4 x float> %1117, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1661 = 
tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1641, <4 x float> %1660, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1662 = extractelement <4 x float> %1661, i64 0 + %1663 = extractelement <4 x float> %1661, i64 1 + %1664 = extractelement <4 x float> %1661, i64 2 + %1665 = extractelement <4 x float> %1661, i64 3 + %1666 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1638, <4 x float> %1123, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1667 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1639, <4 x float> %1666, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1668 = extractelement <4 x float> %1667, i64 0 + %1669 = extractelement <4 x float> %1667, i64 1 + %1670 = extractelement <4 x float> %1667, i64 2 + %1671 = extractelement <4 x float> %1667, i64 3 + %1672 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1638, <4 x float> %1129, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1673 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1639, <4 x float> %1672, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1674 = extractelement <4 x float> %1673, i64 0 + %1675 = extractelement <4 x float> %1673, i64 1 + %1676 = extractelement <4 x float> %1673, i64 2 + %1677 = extractelement <4 x float> %1673, i64 3 + %1678 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1640, <4 x float> %1135, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1679 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1641, <4 x float> %1678, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1680 = extractelement <4 x float> 
%1679, i64 0 + %1681 = extractelement <4 x float> %1679, i64 1 + %1682 = extractelement <4 x float> %1679, i64 2 + %1683 = extractelement <4 x float> %1679, i64 3 + %1684 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1640, <4 x float> %1141, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1685 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1641, <4 x float> %1684, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1686 = extractelement <4 x float> %1685, i64 0 + %1687 = extractelement <4 x float> %1685, i64 1 + %1688 = extractelement <4 x float> %1685, i64 2 + %1689 = extractelement <4 x float> %1685, i64 3 + %1690 = shufflevector <8 x half> %1364, <8 x half> poison, <4 x i32> + %1691 = shufflevector <8 x half> %1364, <8 x half> poison, <4 x i32> + %1692 = shufflevector <8 x half> %1365, <8 x half> poison, <4 x i32> + %1693 = shufflevector <8 x half> %1365, <8 x half> poison, <4 x i32> + %1694 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1690, <4 x float> %1151, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1695 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1691, <4 x float> %1694, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1696 = extractelement <4 x float> %1695, i64 0 + %1697 = extractelement <4 x float> %1695, i64 1 + %1698 = extractelement <4 x float> %1695, i64 2 + %1699 = extractelement <4 x float> %1695, i64 3 + %1700 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1690, <4 x float> %1157, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1701 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1691, <4 x float> %1700, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + %1702 = extractelement <4 x float> %1701, i64 0 + %1703 = extractelement <4 x float> %1701, i64 1 + %1704 = extractelement <4 x float> %1701, i64 2 + %1705 = extractelement <4 x float> %1701, i64 3 + %1706 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1692, <4 x float> %1163, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1707 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1693, <4 x float> %1706, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1708 = extractelement <4 x float> %1707, i64 0 + %1709 = extractelement <4 x float> %1707, i64 1 + %1710 = extractelement <4 x float> %1707, i64 2 + %1711 = extractelement <4 x float> %1707, i64 3 + %1712 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1692, <4 x float> %1169, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1713 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1693, <4 x float> %1712, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1714 = extractelement <4 x float> %1713, i64 0 + %1715 = extractelement <4 x float> %1713, i64 1 + %1716 = extractelement <4 x float> %1713, i64 2 + %1717 = extractelement <4 x float> %1713, i64 3 + %1718 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1690, <4 x float> %1175, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1719 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1691, <4 x float> %1718, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1720 = extractelement <4 x float> %1719, i64 0 + %1721 = extractelement <4 x float> %1719, i64 1 + %1722 = extractelement <4 x float> %1719, i64 2 + %1723 = extractelement <4 x float> 
%1719, i64 3 + %1724 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1690, <4 x float> %1181, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1725 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1691, <4 x float> %1724, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1726 = extractelement <4 x float> %1725, i64 0 + %1727 = extractelement <4 x float> %1725, i64 1 + %1728 = extractelement <4 x float> %1725, i64 2 + %1729 = extractelement <4 x float> %1725, i64 3 + %1730 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1692, <4 x float> %1187, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1731 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1693, <4 x float> %1730, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1732 = extractelement <4 x float> %1731, i64 0 + %1733 = extractelement <4 x float> %1731, i64 1 + %1734 = extractelement <4 x float> %1731, i64 2 + %1735 = extractelement <4 x float> %1731, i64 3 + %1736 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1692, <4 x float> %1193, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1737 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1693, <4 x float> %1736, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1738 = extractelement <4 x float> %1737, i64 0 + %1739 = extractelement <4 x float> %1737, i64 1 + %1740 = extractelement <4 x float> %1737, i64 2 + %1741 = extractelement <4 x float> %1737, i64 3 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !112 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !113 + tail call void 
@llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !114 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !115 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !116 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !117 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !118 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !119 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !120 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !121 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !122 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !123 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !124 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !125 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !126 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !127 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !128 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !129 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !130 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !131 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !132 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !133 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !134 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !135 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !136 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !137 + tail call void 
@llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !138 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !139 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !140 + tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !141 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !142 + tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !143 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !144 + %1742 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1638, <4 x float> %1199, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1743 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1639, <4 x float> %1742, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1744 = extractelement <4 x float> %1743, i64 0 + %1745 = extractelement <4 x float> %1743, i64 1 + %1746 = extractelement <4 x float> %1743, i64 2 + %1747 = extractelement <4 x float> %1743, i64 3 + %1748 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1638, <4 x float> %1205, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1749 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1639, <4 x float> %1748, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1750 = extractelement <4 x float> %1749, i64 0 + %1751 = extractelement <4 x float> %1749, i64 1 + %1752 = extractelement <4 x float> %1749, i64 2 + %1753 = extractelement <4 x float> %1749, i64 3 + %1754 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1640, <4 x float> %1211, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1755 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> 
%1477, <4 x half> %1641, <4 x float> %1754, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1756 = extractelement <4 x float> %1755, i64 0 + %1757 = extractelement <4 x float> %1755, i64 1 + %1758 = extractelement <4 x float> %1755, i64 2 + %1759 = extractelement <4 x float> %1755, i64 3 + %1760 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1640, <4 x float> %1217, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1761 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1641, <4 x float> %1760, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1762 = extractelement <4 x float> %1761, i64 0 + %1763 = extractelement <4 x float> %1761, i64 1 + %1764 = extractelement <4 x float> %1761, i64 2 + %1765 = extractelement <4 x float> %1761, i64 3 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !145 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !146 + fence syncscope("workgroup") release + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + %1766 = load <8 x half>, ptr addrspace(3) %233, align 16 + %1767 = load <8 x half>, ptr addrspace(3) %235, align 16 + %1768 = load <8 x half>, ptr addrspace(3) %243, align 16 + %1769 = load <8 x half>, ptr addrspace(3) %245, align 16 + %1770 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1638, <4 x float> %1227, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1771 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1639, <4 x float> %1770, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1772 = extractelement <4 x float> %1771, i64 0 + %1773 = extractelement <4 x float> %1771, i64 1 + %1774 = extractelement <4 x float> %1771, i64 2 + %1775 = extractelement <4 x float> %1771, 
i64 3 + %1776 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1638, <4 x float> %1233, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1777 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1639, <4 x float> %1776, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1778 = extractelement <4 x float> %1777, i64 0 + %1779 = extractelement <4 x float> %1777, i64 1 + %1780 = extractelement <4 x float> %1777, i64 2 + %1781 = extractelement <4 x float> %1777, i64 3 + %1782 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1640, <4 x float> %1239, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1783 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1641, <4 x float> %1782, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1784 = extractelement <4 x float> %1783, i64 0 + %1785 = extractelement <4 x float> %1783, i64 1 + %1786 = extractelement <4 x float> %1783, i64 2 + %1787 = extractelement <4 x float> %1783, i64 3 + %1788 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1640, <4 x float> %1245, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1789 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1641, <4 x float> %1788, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1790 = extractelement <4 x float> %1789, i64 0 + %1791 = extractelement <4 x float> %1789, i64 1 + %1792 = extractelement <4 x float> %1789, i64 2 + %1793 = extractelement <4 x float> %1789, i64 3 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !147 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !148 + tail call void 
@llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !149 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !150 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !151 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !152 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !153 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !154 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !155 + %1794 = load <8 x half>, ptr addrspace(3) %251, align 16 + %1795 = load <8 x half>, ptr addrspace(3) %253, align 16 + %1796 = load <8 x half>, ptr addrspace(3) %258, align 16 + %1797 = load <8 x half>, ptr addrspace(3) %260, align 16 + %1798 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1690, <4 x float> %1255, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1799 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1691, <4 x float> %1798, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1800 = extractelement <4 x float> %1799, i64 0 + %1801 = extractelement <4 x float> %1799, i64 1 + %1802 = extractelement <4 x float> %1799, i64 2 + %1803 = extractelement <4 x float> %1799, i64 3 + %1804 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1690, <4 x float> %1261, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1805 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1691, <4 x float> %1804, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1806 = extractelement <4 x float> %1805, i64 0 + %1807 = extractelement <4 x float> %1805, i64 1 + %1808 = extractelement <4 x float> %1805, i64 2 + %1809 = extractelement <4 x float> %1805, i64 3 + %1810 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1692, <4 x float> %1267, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1811 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1693, <4 x float> %1810, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1812 = extractelement <4 x float> %1811, i64 0 + %1813 = extractelement <4 x float> %1811, i64 1 + %1814 = extractelement <4 x float> %1811, i64 2 + %1815 = extractelement <4 x float> %1811, i64 3 + %1816 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1692, <4 x float> %1273, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1817 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1693, <4 x float> %1816, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1818 = extractelement <4 x float> %1817, i64 0 + %1819 = extractelement <4 x float> %1817, i64 1 + %1820 = extractelement <4 x float> %1817, i64 2 + %1821 = extractelement <4 x float> %1817, i64 3 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !156 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !157 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !158 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !159 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !160 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !161 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !162 + tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !163 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !164 + %1822 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1690, <4 x float> %1279, 
i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1823 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1691, <4 x float> %1822, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1824 = extractelement <4 x float> %1823, i64 0 + %1825 = extractelement <4 x float> %1823, i64 1 + %1826 = extractelement <4 x float> %1823, i64 2 + %1827 = extractelement <4 x float> %1823, i64 3 + %1828 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1690, <4 x float> %1285, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1829 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1691, <4 x float> %1828, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1830 = extractelement <4 x float> %1829, i64 0 + %1831 = extractelement <4 x float> %1829, i64 1 + %1832 = extractelement <4 x float> %1829, i64 2 + %1833 = extractelement <4 x float> %1829, i64 3 + %1834 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1692, <4 x float> %1291, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1835 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1693, <4 x float> %1834, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1836 = extractelement <4 x float> %1835, i64 0 + %1837 = extractelement <4 x float> %1835, i64 1 + %1838 = extractelement <4 x float> %1835, i64 2 + %1839 = extractelement <4 x float> %1835, i64 3 + %1840 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1692, <4 x float> %1297, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %1841 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1693, <4 x float> %1840, i32 0, i32 0, i32 0) + tail 
call void @llvm.amdgcn.sched.barrier(i32 2038) + %1842 = extractelement <4 x float> %1841, i64 0 + %1843 = extractelement <4 x float> %1841, i64 1 + %1844 = extractelement <4 x float> %1841, i64 2 + %1845 = extractelement <4 x float> %1841, i64 3 + tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !165 + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !166 + %1846 = add nuw nsw i32 %769, 1 + %exitcond.not = icmp eq i32 %769, %413 + %1847 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> + %1848 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> + %1849 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> + %1850 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> + %1851 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> + %1852 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> + %1853 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> + %1854 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> + %1855 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> + %1856 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> + %1857 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> + %1858 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> + %1859 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> + %1860 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> + %1861 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> + %1862 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> + %1863 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> + %1864 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> + %1865 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> + %1866 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> + %1867 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> + %1868 = shufflevector <8 x half> %1627, 
<8 x half> poison, <2 x i32> + %1869 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> + %1870 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> + %1871 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> + %1872 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> + %1873 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> + %1874 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> + %1875 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> + %1876 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> + %1877 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> + %1878 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> + %1879 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> + %1880 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> + %1881 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> + %1882 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> + %1883 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> + %1884 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> + %1885 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> + %1886 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> + %1887 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> + %1888 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> + %1889 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> + %1890 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> + %1891 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> + %1892 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> + %1893 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> + %1894 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> + %1895 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> + %1896 = shufflevector <8 x half> %1794, <8 x half> 
poison, <2 x i32> + %1897 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> + %1898 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> + %1899 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> + %1900 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> + %1901 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> + %1902 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> + %1903 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> + %1904 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> + %1905 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> + %1906 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> + %1907 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> + %1908 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> + %1909 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> + %1910 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> + %1911 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> + %1912 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> + %1913 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> + %1914 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> + %1915 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> + %1916 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> + %1917 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> + %1918 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> + %1919 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> + %1920 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> + %1921 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> + %1922 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> + %1923 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> + %1924 = shufflevector <8 x half> %1473, <8 x half> poison, <2 
x i32> + %1925 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> + %1926 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> + %1927 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> + %1928 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> + %1929 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> + %1930 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> + %1931 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> + %1932 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> + %1933 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> + %1934 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> + %1935 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> + %1936 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> + %1937 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> + %1938 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> + %1939 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> + %1940 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> + %1941 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> + %1942 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> + br i1 %exitcond.not, label %._crit_edge, label %510 + +._crit_edge: ; preds = %510, %.._crit_edge_crit_edge + %.pre-phi1068 = phi i32 [ %.pre1067, %.._crit_edge_crit_edge ], [ %410, %510 ] + %.pre-phi1066 = phi i32 [ %.pre1065, %.._crit_edge_crit_edge ], [ %409, %510 ] + %.pre-phi1064 = phi i32 [ %.pre1063, %.._crit_edge_crit_edge ], [ %406, %510 ] + %.pre-phi1062 = phi i32 [ %.pre1061, %.._crit_edge_crit_edge ], [ %405, %510 ] + %.pre-phi1060 = phi i32 [ %.pre1059, %.._crit_edge_crit_edge ], [ %402, %510 ] + %.pre-phi1058 = phi i32 [ %.pre1057, %.._crit_edge_crit_edge ], [ %401, %510 ] + %.pre-phi1056 = phi i32 [ %.pre1055, %.._crit_edge_crit_edge ], [ %398, %510 ] + %.pre-phi1054 = phi i32 [ 
%.pre1053, %.._crit_edge_crit_edge ], [ %397, %510 ] + %.pre-phi1052 = phi i32 [ %.pre1051, %.._crit_edge_crit_edge ], [ %394, %510 ] + %.pre-phi1050 = phi i32 [ %.pre1049, %.._crit_edge_crit_edge ], [ %393, %510 ] + %.pre-phi1048 = phi i32 [ %.pre1047, %.._crit_edge_crit_edge ], [ %390, %510 ] + %.pre-phi1046 = phi i32 [ %.pre1045, %.._crit_edge_crit_edge ], [ %384, %510 ] + %.pre-phi1044 = phi i32 [ %.pre1043, %.._crit_edge_crit_edge ], [ %383, %510 ] + %.pre-phi1042 = phi i32 [ %.pre1041, %.._crit_edge_crit_edge ], [ %387, %510 ] + %.pre-phi1034 = phi i32 [ %.pre1033, %.._crit_edge_crit_edge ], [ %377, %510 ] + %.pre-phi1030 = phi i32 [ %.pre1029, %.._crit_edge_crit_edge ], [ %375, %510 ] + %.pre-phi1026 = phi i32 [ %.pre1025, %.._crit_edge_crit_edge ], [ %371, %510 ] + %.pre-phi1022 = phi i32 [ %.pre1021, %.._crit_edge_crit_edge ], [ %369, %510 ] + %.pre-phi1018 = phi i32 [ %.pre1017, %.._crit_edge_crit_edge ], [ %365, %510 ] + %.pre-phi1014 = phi i32 [ %.pre1013, %.._crit_edge_crit_edge ], [ %361, %510 ] + %1943 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1308, %510 ], !dbg !167 + %1944 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1309, %510 ], !dbg !167 + %1945 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1310, %510 ], !dbg !167 + %1946 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1311, %510 ], !dbg !167 + %1947 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1314, %510 ], !dbg !167 + %1948 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1315, %510 ], !dbg !167 + %1949 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1316, %510 ], !dbg !167 + %1950 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1317, %510 ], !dbg !167 + %1951 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1320, %510 ], !dbg !167 + %1952 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1321, %510 ], !dbg !167 + %1953 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1322, 
%510 ], !dbg !167 + %1954 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1323, %510 ], !dbg !167 + %1955 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1326, %510 ], !dbg !167 + %1956 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1327, %510 ], !dbg !167 + %1957 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1328, %510 ], !dbg !167 + %1958 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1329, %510 ], !dbg !167 + %1959 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1340, %510 ], !dbg !168 + %1960 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1341, %510 ], !dbg !168 + %1961 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1342, %510 ], !dbg !168 + %1962 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1343, %510 ], !dbg !168 + %1963 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1346, %510 ], !dbg !168 + %1964 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1347, %510 ], !dbg !168 + %1965 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1348, %510 ], !dbg !168 + %1966 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1349, %510 ], !dbg !168 + %1967 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1352, %510 ], !dbg !168 + %1968 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1353, %510 ], !dbg !168 + %1969 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1354, %510 ], !dbg !168 + %1970 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1355, %510 ], !dbg !168 + %1971 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1358, %510 ], !dbg !168 + %1972 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1359, %510 ], !dbg !168 + %1973 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1360, %510 ], !dbg !168 + %1974 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1361, %510 ], !dbg !168 + %1975 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1482, %510 ], !dbg !169 + 
%1976 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1483, %510 ], !dbg !169 + %1977 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1484, %510 ], !dbg !169 + %1978 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1485, %510 ], !dbg !169 + %1979 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1488, %510 ], !dbg !169 + %1980 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1489, %510 ], !dbg !169 + %1981 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1490, %510 ], !dbg !169 + %1982 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1491, %510 ], !dbg !169 + %1983 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1494, %510 ], !dbg !169 + %1984 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1495, %510 ], !dbg !169 + %1985 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1496, %510 ], !dbg !169 + %1986 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1497, %510 ], !dbg !169 + %1987 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1500, %510 ], !dbg !169 + %1988 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1501, %510 ], !dbg !169 + %1989 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1502, %510 ], !dbg !169 + %1990 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1503, %510 ], !dbg !169 + %1991 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1510, %510 ], !dbg !170 + %1992 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1511, %510 ], !dbg !170 + %1993 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1512, %510 ], !dbg !170 + %1994 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1513, %510 ], !dbg !170 + %1995 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1516, %510 ], !dbg !170 + %1996 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1517, %510 ], !dbg !170 + %1997 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1518, %510 ], !dbg !170 + %1998 = phi float [ 
0.000000e+00, %.._crit_edge_crit_edge ], [ %1519, %510 ], !dbg !170 + %1999 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1522, %510 ], !dbg !170 + %2000 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1523, %510 ], !dbg !170 + %2001 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1524, %510 ], !dbg !170 + %2002 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1525, %510 ], !dbg !170 + %2003 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1528, %510 ], !dbg !170 + %2004 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1529, %510 ], !dbg !170 + %2005 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1530, %510 ], !dbg !170 + %2006 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1531, %510 ], !dbg !170 + %2007 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1372, %510 ], !dbg !171 + %2008 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1373, %510 ], !dbg !171 + %2009 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1374, %510 ], !dbg !171 + %2010 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1375, %510 ], !dbg !171 + %2011 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1378, %510 ], !dbg !171 + %2012 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1379, %510 ], !dbg !171 + %2013 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1380, %510 ], !dbg !171 + %2014 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1381, %510 ], !dbg !171 + %2015 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1384, %510 ], !dbg !171 + %2016 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1385, %510 ], !dbg !171 + %2017 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1386, %510 ], !dbg !171 + %2018 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1387, %510 ], !dbg !171 + %2019 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1390, %510 ], !dbg !171 + %2020 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1391, %510 ], !dbg !171 + %2021 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1392, %510 ], !dbg !171 + %2022 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1393, %510 ], !dbg !171 + %2023 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1396, %510 ], !dbg !172 + %2024 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1397, %510 ], !dbg !172 + %2025 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1398, %510 ], !dbg !172 + %2026 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1399, %510 ], !dbg !172 + %2027 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1402, %510 ], !dbg !172 + %2028 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1403, %510 ], !dbg !172 + %2029 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1404, %510 ], !dbg !172 + %2030 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1405, %510 ], !dbg !172 + %2031 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1408, %510 ], !dbg !172 + %2032 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1409, %510 ], !dbg !172 + %2033 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1410, %510 ], !dbg !172 + %2034 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1411, %510 ], !dbg !172 + %2035 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1414, %510 ], !dbg !172 + %2036 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1415, %510 ], !dbg !172 + %2037 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1416, %510 ], !dbg !172 + %2038 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1417, %510 ], !dbg !172 + %2039 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1534, %510 ], !dbg !173 + %2040 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1535, %510 ], !dbg !173 + %2041 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1536, %510 ], !dbg !173 + %2042 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1537, %510 ], !dbg !173 + %2043 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1540, %510 ], !dbg !173 + %2044 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1541, %510 ], !dbg !173 + %2045 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1542, %510 ], !dbg !173 + %2046 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1543, %510 ], !dbg !173 + %2047 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1546, %510 ], !dbg !173 + %2048 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1547, %510 ], !dbg !173 + %2049 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1548, %510 ], !dbg !173 + %2050 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1549, %510 ], !dbg !173 + %2051 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1552, %510 ], !dbg !173 + %2052 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1553, %510 ], !dbg !173 + %2053 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1554, %510 ], !dbg !173 + %2054 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1555, %510 ], !dbg !173 + %2055 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1558, %510 ], !dbg !174 + %2056 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1559, %510 ], !dbg !174 + %2057 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1560, %510 ], !dbg !174 + %2058 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1561, %510 ], !dbg !174 + %2059 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1564, %510 ], !dbg !174 + %2060 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1565, %510 ], !dbg !174 + %2061 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1566, %510 ], !dbg !174 + %2062 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1567, %510 ], !dbg !174 + %2063 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1570, %510 ], !dbg !174 + %2064 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1571, %510 ], !dbg !174 + %2065 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1572, %510 ], !dbg !174 + %2066 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1573, %510 ], !dbg !174 + %2067 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1576, %510 ], !dbg !174 + %2068 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1577, %510 ], !dbg !174 + %2069 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1578, %510 ], !dbg !174 + %2070 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1579, %510 ], !dbg !174 + %2071 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1644, %510 ], !dbg !175 + %2072 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1645, %510 ], !dbg !175 + %2073 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1646, %510 ], !dbg !175 + %2074 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1647, %510 ], !dbg !175 + %2075 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1650, %510 ], !dbg !175 + %2076 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1651, %510 ], !dbg !175 + %2077 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1652, %510 ], !dbg !175 + %2078 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1653, %510 ], !dbg !175 + %2079 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1656, %510 ], !dbg !175 + %2080 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1657, %510 ], !dbg !175 + %2081 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1658, %510 ], !dbg !175 + %2082 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1659, %510 ], !dbg !175 + %2083 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1662, %510 ], !dbg !175 + %2084 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1663, %510 ], !dbg !175 + %2085 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1664, %510 ], !dbg !175 + %2086 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1665, %510 ], !dbg !175 + %2087 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1668, %510 ], !dbg !176 + %2088 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1669, %510 ], !dbg !176 + %2089 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1670, %510 ], !dbg !176 + %2090 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1671, %510 ], !dbg !176 + %2091 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1674, %510 ], !dbg !176 + %2092 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1675, %510 ], !dbg !176 + %2093 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1676, %510 ], !dbg !176 + %2094 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1677, %510 ], !dbg !176 + %2095 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1680, %510 ], !dbg !176 + %2096 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1681, %510 ], !dbg !176 + %2097 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1682, %510 ], !dbg !176 + %2098 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1683, %510 ], !dbg !176 + %2099 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1686, %510 ], !dbg !176 + %2100 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1687, %510 ], !dbg !176 + %2101 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1688, %510 ], !dbg !176 + %2102 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1689, %510 ], !dbg !176 + %2103 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1744, %510 ], !dbg !177 + %2104 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1745, %510 ], !dbg !177 + %2105 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1746, %510 ], !dbg !177 + %2106 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1747, %510 ], !dbg !177 + %2107 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1750, %510 ], !dbg !177 + %2108 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1751, %510 ], !dbg !177 + %2109 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1752, %510 ], !dbg !177 + %2110 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1753, %510 ], !dbg !177 + %2111 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1756, %510 ], !dbg !177 + %2112 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1757, %510 ], !dbg !177 + %2113 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1758, %510 ], !dbg !177 + %2114 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1759, %510 ], !dbg !177 + %2115 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1762, %510 ], !dbg !177 + %2116 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1763, %510 ], !dbg !177 + %2117 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1764, %510 ], !dbg !177 + %2118 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1765, %510 ], !dbg !177 + %2119 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1772, %510 ], !dbg !178 + %2120 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1773, %510 ], !dbg !178 + %2121 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1774, %510 ], !dbg !178 + %2122 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1775, %510 ], !dbg !178 + %2123 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1778, %510 ], !dbg !178 + %2124 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1779, %510 ], !dbg !178 + %2125 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1780, %510 ], !dbg !178 + %2126 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1781, %510 ], !dbg !178 + %2127 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1784, %510 ], !dbg !178 + %2128 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1785, %510 ], !dbg !178 + %2129 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1786, %510 ], !dbg !178 + %2130 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1787, %510 ], !dbg !178 + %2131 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1790, %510 ], !dbg !178 + %2132 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1791, %510 ], !dbg !178 + %2133 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1792, %510 ], !dbg !178 + %2134 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1793, %510 ], !dbg !178 + %2135 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1696, %510 ], !dbg !179 + %2136 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1697, %510 ], !dbg !179 + %2137 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1698, %510 ], !dbg !179 + %2138 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1699, %510 ], !dbg !179 + %2139 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1702, %510 ], !dbg !179 + %2140 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1703, %510 ], !dbg !179 + %2141 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1704, %510 ], !dbg !179 + %2142 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1705, %510 ], !dbg !179 + %2143 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1708, %510 ], !dbg !179 + %2144 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1709, %510 ], !dbg !179 + %2145 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1710, %510 ], !dbg !179 + %2146 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1711, %510 ], !dbg !179 + %2147 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1714, %510 ], !dbg !179 + %2148 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1715, %510 ], !dbg !179 + %2149 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1716, %510 ], !dbg !179 + %2150 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1717, %510 ], !dbg !179 + %2151 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1720, %510 ], !dbg !180 + %2152 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1721, %510 ], !dbg !180 + %2153 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1722, %510 ], !dbg !180 + %2154 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1723, %510 ], !dbg !180 + %2155 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1726, %510 ], !dbg !180 + %2156 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1727, %510 ], !dbg !180 + %2157 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1728, %510 ], !dbg !180 + %2158 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1729, %510 ], !dbg !180 + %2159 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1732, %510 ], !dbg !180 + %2160 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1733, %510 ], !dbg !180 + %2161 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1734, %510 ], !dbg !180 + %2162 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1735, %510 ], !dbg !180 + %2163 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1738, %510 ], !dbg !180 + %2164 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1739, %510 ], !dbg !180 + %2165 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1740, %510 ], !dbg !180 + %2166 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1741, %510 ], !dbg !180 + %2167 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1800, %510 ], !dbg !181 + %2168 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1801, %510 ], !dbg !181 + %2169 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1802, %510 ], !dbg !181 + %2170 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1803, %510 ], !dbg !181 + %2171 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1806, %510 ], !dbg !181 + %2172 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1807, %510 ], !dbg !181 + %2173 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1808, %510 ], !dbg !181 + %2174 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1809, %510 ], !dbg !181 + %2175 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1812, %510 ], !dbg !181 + %2176 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1813, %510 ], !dbg !181 + %2177 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1814, %510 ], !dbg !181 + %2178 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1815, %510 ], !dbg !181 + %2179 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1818, %510 ], !dbg !181 + %2180 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1819, %510 ], !dbg !181 + %2181 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1820, %510 ], !dbg !181 + %2182 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1821, %510 ], !dbg !181 + %2183 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1824, %510 ], !dbg !182 + %2184 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1825, %510 ], !dbg !182 + %2185 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1826, %510 ], !dbg !182 + %2186 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1827, %510 ], !dbg !182 + %2187 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1830, %510 ], !dbg !182 + %2188 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1831, %510 ], !dbg !182 + %2189 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1832, %510 ], !dbg !182 + %2190 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1833, %510 ], !dbg !182 + %2191 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1836, %510 ], !dbg !182 + %2192 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1837, %510 ], !dbg !182 + %2193 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1838, %510 ], !dbg !182 + %2194 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1839, %510 ], !dbg !182 + %2195 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1842, %510 ], !dbg !182 + %2196 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1843, %510 ], !dbg !182 + %2197 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1844, %510 ], !dbg !182 + %2198 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1845, %510 ], !dbg !182 + %2199 = phi <2 x half> [ %263, %.._crit_edge_crit_edge ], [ %1847, %510 ] + %2200 = phi <2 x half> [ %264, %.._crit_edge_crit_edge ], [ %1848, %510 ] + %2201 = phi <2 x half> [ %265, %.._crit_edge_crit_edge ], [ %1849, %510 ] + %2202 = phi <2 x half> [ %266, %.._crit_edge_crit_edge ], [ %1850, %510 ] + %2203 = phi <2 x half> [ %267, %.._crit_edge_crit_edge ], [ %1851, %510 ] + %2204 = phi <2 x half> [ %268, %.._crit_edge_crit_edge ], [ %1852, %510 ] + %2205 = phi <2 x half> [ %269, %.._crit_edge_crit_edge ], [ %1853, %510 ] + %2206 = phi <2 x half> [ %270, %.._crit_edge_crit_edge ], [ %1854, %510 ] + %2207 = phi <2 x half> [ %271, %.._crit_edge_crit_edge ], [ %1855, %510 ] + %2208 = phi <2 x half> [ %272, %.._crit_edge_crit_edge ], [ %1856, %510 ] + %2209 = phi <2 x half> [ %273, %.._crit_edge_crit_edge ], [ %1857, %510 ] + %2210 = phi <2 x half> [ %274, %.._crit_edge_crit_edge ], [ %1858, %510 ] + %2211 = phi <2 x half> [ %275, %.._crit_edge_crit_edge ], [ %1859, %510 ] + %2212 = phi <2 x half> [ %276, %.._crit_edge_crit_edge ], [ %1860, %510 ] + %2213 = phi <2 x half> [ %277, %.._crit_edge_crit_edge ], [ %1861, %510 ] + %2214 = phi <2 x half> [ %278, %.._crit_edge_crit_edge ], [ %1862, %510 ] + %2215 = phi <2 x half> [ %279, %.._crit_edge_crit_edge ], [ %1863, %510 ] + %2216 = phi <2 x half> [ %280, %.._crit_edge_crit_edge ], [ %1864, %510 ] + %2217 = phi <2 x half> [ %281, %.._crit_edge_crit_edge ], [ %1865, %510 ] + %2218 = phi <2 x half> [ %282, %.._crit_edge_crit_edge ], [ %1866, %510 ] + %2219 = phi <2 x half> [ %283, %.._crit_edge_crit_edge ], [ %1867, %510 ] + %2220 = phi <2 x half> [ %284, %.._crit_edge_crit_edge ], [ %1868, %510 ] + %2221 = phi <2 x half> [ %285, %.._crit_edge_crit_edge ], [ %1869, %510 ] + %2222 = phi <2 
x half> [ %286, %.._crit_edge_crit_edge ], [ %1870, %510 ] + %2223 = phi <2 x half> [ %287, %.._crit_edge_crit_edge ], [ %1871, %510 ] + %2224 = phi <2 x half> [ %288, %.._crit_edge_crit_edge ], [ %1872, %510 ] + %2225 = phi <2 x half> [ %289, %.._crit_edge_crit_edge ], [ %1873, %510 ] + %2226 = phi <2 x half> [ %290, %.._crit_edge_crit_edge ], [ %1874, %510 ] + %2227 = phi <2 x half> [ %291, %.._crit_edge_crit_edge ], [ %1875, %510 ] + %2228 = phi <2 x half> [ %292, %.._crit_edge_crit_edge ], [ %1876, %510 ] + %2229 = phi <2 x half> [ %293, %.._crit_edge_crit_edge ], [ %1877, %510 ] + %2230 = phi <2 x half> [ %294, %.._crit_edge_crit_edge ], [ %1878, %510 ] + %2231 = phi <2 x half> [ %295, %.._crit_edge_crit_edge ], [ %1879, %510 ] + %2232 = phi <2 x half> [ %296, %.._crit_edge_crit_edge ], [ %1880, %510 ] + %2233 = phi <2 x half> [ %297, %.._crit_edge_crit_edge ], [ %1881, %510 ] + %2234 = phi <2 x half> [ %298, %.._crit_edge_crit_edge ], [ %1882, %510 ] + %2235 = phi <2 x half> [ %299, %.._crit_edge_crit_edge ], [ %1883, %510 ] + %2236 = phi <2 x half> [ %300, %.._crit_edge_crit_edge ], [ %1884, %510 ] + %2237 = phi <2 x half> [ %301, %.._crit_edge_crit_edge ], [ %1885, %510 ] + %2238 = phi <2 x half> [ %302, %.._crit_edge_crit_edge ], [ %1886, %510 ] + %2239 = phi <2 x half> [ %303, %.._crit_edge_crit_edge ], [ %1887, %510 ] + %2240 = phi <2 x half> [ %304, %.._crit_edge_crit_edge ], [ %1888, %510 ] + %2241 = phi <2 x half> [ %305, %.._crit_edge_crit_edge ], [ %1889, %510 ] + %2242 = phi <2 x half> [ %306, %.._crit_edge_crit_edge ], [ %1890, %510 ] + %2243 = phi <2 x half> [ %307, %.._crit_edge_crit_edge ], [ %1891, %510 ] + %2244 = phi <2 x half> [ %308, %.._crit_edge_crit_edge ], [ %1892, %510 ] + %2245 = phi <2 x half> [ %309, %.._crit_edge_crit_edge ], [ %1893, %510 ] + %2246 = phi <2 x half> [ %310, %.._crit_edge_crit_edge ], [ %1894, %510 ] + %2247 = phi <2 x half> [ %311, %.._crit_edge_crit_edge ], [ %1895, %510 ] + %2248 = phi <2 x half> [ %312, 
%.._crit_edge_crit_edge ], [ %1896, %510 ] + %2249 = phi <2 x half> [ %313, %.._crit_edge_crit_edge ], [ %1897, %510 ] + %2250 = phi <2 x half> [ %314, %.._crit_edge_crit_edge ], [ %1898, %510 ] + %2251 = phi <2 x half> [ %315, %.._crit_edge_crit_edge ], [ %1899, %510 ] + %2252 = phi <2 x half> [ %316, %.._crit_edge_crit_edge ], [ %1900, %510 ] + %2253 = phi <2 x half> [ %317, %.._crit_edge_crit_edge ], [ %1901, %510 ] + %2254 = phi <2 x half> [ %318, %.._crit_edge_crit_edge ], [ %1902, %510 ] + %2255 = phi <2 x half> [ %319, %.._crit_edge_crit_edge ], [ %1903, %510 ] + %2256 = phi <2 x half> [ %320, %.._crit_edge_crit_edge ], [ %1904, %510 ] + %2257 = phi <2 x half> [ %321, %.._crit_edge_crit_edge ], [ %1905, %510 ] + %2258 = phi <2 x half> [ %322, %.._crit_edge_crit_edge ], [ %1906, %510 ] + %2259 = phi <2 x half> [ %323, %.._crit_edge_crit_edge ], [ %1907, %510 ] + %2260 = phi <2 x half> [ %324, %.._crit_edge_crit_edge ], [ %1908, %510 ] + %2261 = phi <2 x half> [ %325, %.._crit_edge_crit_edge ], [ %1909, %510 ] + %2262 = phi <2 x half> [ %326, %.._crit_edge_crit_edge ], [ %1910, %510 ] + %2263 = phi <2 x half> [ %327, %.._crit_edge_crit_edge ], [ %1911, %510 ] + %2264 = phi <2 x half> [ %328, %.._crit_edge_crit_edge ], [ %1912, %510 ] + %2265 = phi <2 x half> [ %329, %.._crit_edge_crit_edge ], [ %1913, %510 ] + %2266 = phi <2 x half> [ %330, %.._crit_edge_crit_edge ], [ %1914, %510 ] + %2267 = phi <2 x half> [ %331, %.._crit_edge_crit_edge ], [ %1915, %510 ] + %2268 = phi <2 x half> [ %332, %.._crit_edge_crit_edge ], [ %1916, %510 ] + %2269 = phi <2 x half> [ %333, %.._crit_edge_crit_edge ], [ %1917, %510 ] + %2270 = phi <2 x half> [ %334, %.._crit_edge_crit_edge ], [ %1918, %510 ] + %2271 = phi <2 x half> [ %335, %.._crit_edge_crit_edge ], [ %1919, %510 ] + %2272 = phi <2 x half> [ %336, %.._crit_edge_crit_edge ], [ %1920, %510 ] + %2273 = phi <2 x half> [ %337, %.._crit_edge_crit_edge ], [ %1921, %510 ] + %2274 = phi <2 x half> [ %338, %.._crit_edge_crit_edge 
], [ %1922, %510 ] + %2275 = phi <2 x half> [ %339, %.._crit_edge_crit_edge ], [ %1923, %510 ] + %2276 = phi <2 x half> [ %340, %.._crit_edge_crit_edge ], [ %1924, %510 ] + %2277 = phi <2 x half> [ %341, %.._crit_edge_crit_edge ], [ %1925, %510 ] + %2278 = phi <2 x half> [ %342, %.._crit_edge_crit_edge ], [ %1926, %510 ] + %2279 = phi <2 x half> [ %343, %.._crit_edge_crit_edge ], [ %1927, %510 ] + %2280 = phi <2 x half> [ %344, %.._crit_edge_crit_edge ], [ %1928, %510 ] + %2281 = phi <2 x half> [ %345, %.._crit_edge_crit_edge ], [ %1929, %510 ] + %2282 = phi <2 x half> [ %346, %.._crit_edge_crit_edge ], [ %1930, %510 ] + %2283 = phi <2 x half> [ %347, %.._crit_edge_crit_edge ], [ %1931, %510 ] + %2284 = phi <2 x half> [ %348, %.._crit_edge_crit_edge ], [ %1932, %510 ] + %2285 = phi <2 x half> [ %349, %.._crit_edge_crit_edge ], [ %1933, %510 ] + %2286 = phi <2 x half> [ %350, %.._crit_edge_crit_edge ], [ %1934, %510 ] + %2287 = phi <2 x half> [ %351, %.._crit_edge_crit_edge ], [ %1935, %510 ] + %2288 = phi <2 x half> [ %352, %.._crit_edge_crit_edge ], [ %1936, %510 ] + %2289 = phi <2 x half> [ %353, %.._crit_edge_crit_edge ], [ %1937, %510 ] + %2290 = phi <2 x half> [ %354, %.._crit_edge_crit_edge ], [ %1938, %510 ] + %2291 = phi <2 x half> [ %355, %.._crit_edge_crit_edge ], [ %1939, %510 ] + %2292 = phi <2 x half> [ %356, %.._crit_edge_crit_edge ], [ %1940, %510 ] + %2293 = phi <2 x half> [ %357, %.._crit_edge_crit_edge ], [ %1941, %510 ] + %2294 = phi <2 x half> [ %358, %.._crit_edge_crit_edge ], [ %1942, %510 ] + %2295 = and i32 %237, 28 + %2296 = or disjoint i32 %2295, 224 + %2297 = or disjoint i32 %2295, 192 + %2298 = or disjoint i32 %2295, 160 + %2299 = or disjoint i32 %2295, 128 + %2300 = or disjoint i32 %2295, 96 + %2301 = or disjoint i32 %2295, 64 + %2302 = or disjoint i32 %2295, 32 + %2303 = or disjoint i32 %225, 224 + %2304 = or disjoint i32 %225, 192 + %2305 = or disjoint i32 %225, 160 + %2306 = or disjoint i32 %225, 128 + %2307 = or disjoint i32 %225, 
96 + %2308 = or disjoint i32 %225, 64 + %2309 = or disjoint i32 %225, 32 + %2310 = shufflevector <2 x half> %2231, <2 x half> %2232, <4 x i32> + %2311 = shufflevector <2 x half> %2233, <2 x half> %2234, <4 x i32> + %2312 = shufflevector <2 x half> %2235, <2 x half> %2236, <4 x i32> + %2313 = shufflevector <2 x half> %2237, <2 x half> %2238, <4 x i32> + %2314 = shufflevector <2 x half> %2239, <2 x half> %2240, <4 x i32> + %2315 = shufflevector <2 x half> %2241, <2 x half> %2242, <4 x i32> + %2316 = shufflevector <2 x half> %2243, <2 x half> %2244, <4 x i32> + %2317 = shufflevector <2 x half> %2245, <2 x half> %2246, <4 x i32> + %2318 = insertelement <4 x float> poison, float %1943, i64 0 + %2319 = insertelement <4 x float> %2318, float %1944, i64 1 + %2320 = insertelement <4 x float> %2319, float %1945, i64 2 + %2321 = insertelement <4 x float> %2320, float %1946, i64 3 + %2322 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2310, <4 x float> %2321, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2323 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2311, <4 x float> %2322, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2324 = insertelement <4 x float> poison, float %1947, i64 0 + %2325 = insertelement <4 x float> %2324, float %1948, i64 1 + %2326 = insertelement <4 x float> %2325, float %1949, i64 2 + %2327 = insertelement <4 x float> %2326, float %1950, i64 3 + %2328 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2310, <4 x float> %2327, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2329 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2311, <4 x float> %2328, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2330 = insertelement <4 x float> poison, float %1951, i64 0 + %2331 = 
insertelement <4 x float> %2330, float %1952, i64 1 + %2332 = insertelement <4 x float> %2331, float %1953, i64 2 + %2333 = insertelement <4 x float> %2332, float %1954, i64 3 + %2334 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2312, <4 x float> %2333, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2335 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2313, <4 x float> %2334, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2336 = insertelement <4 x float> poison, float %1955, i64 0 + %2337 = insertelement <4 x float> %2336, float %1956, i64 1 + %2338 = insertelement <4 x float> %2337, float %1957, i64 2 + %2339 = insertelement <4 x float> %2338, float %1958, i64 3 + %2340 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2312, <4 x float> %2339, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2341 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2313, <4 x float> %2340, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2342 = shufflevector <2 x half> %2255, <2 x half> %2256, <4 x i32> + %2343 = shufflevector <2 x half> %2257, <2 x half> %2258, <4 x i32> + %2344 = shufflevector <2 x half> %2259, <2 x half> %2260, <4 x i32> + %2345 = shufflevector <2 x half> %2261, <2 x half> %2262, <4 x i32> + %2346 = insertelement <4 x float> poison, float %1959, i64 0 + %2347 = insertelement <4 x float> %2346, float %1960, i64 1 + %2348 = insertelement <4 x float> %2347, float %1961, i64 2 + %2349 = insertelement <4 x float> %2348, float %1962, i64 3 + %2350 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2310, <4 x float> %2349, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2351 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2311, <4 x float> %2350, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2352 = insertelement <4 x float> poison, float %1963, i64 0 + %2353 = insertelement <4 x float> %2352, float %1964, i64 1 + %2354 = insertelement <4 x float> %2353, float %1965, i64 2 + %2355 = insertelement <4 x float> %2354, float %1966, i64 3 + %2356 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2310, <4 x float> %2355, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2357 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2311, <4 x float> %2356, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2358 = insertelement <4 x float> poison, float %1967, i64 0 + %2359 = insertelement <4 x float> %2358, float %1968, i64 1 + %2360 = insertelement <4 x float> %2359, float %1969, i64 2 + %2361 = insertelement <4 x float> %2360, float %1970, i64 3 + %2362 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2312, <4 x float> %2361, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2363 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2313, <4 x float> %2362, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2364 = insertelement <4 x float> poison, float %1971, i64 0 + %2365 = insertelement <4 x float> %2364, float %1972, i64 1 + %2366 = insertelement <4 x float> %2365, float %1973, i64 2 + %2367 = insertelement <4 x float> %2366, float %1974, i64 3 + %2368 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2312, <4 x float> %2367, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2369 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2313, <4 x 
float> %2368, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2370 = shufflevector <2 x half> %2247, <2 x half> %2248, <4 x i32> + %2371 = shufflevector <2 x half> %2249, <2 x half> %2250, <4 x i32> + %2372 = shufflevector <2 x half> %2251, <2 x half> %2252, <4 x i32> + %2373 = shufflevector <2 x half> %2253, <2 x half> %2254, <4 x i32> + %2374 = insertelement <4 x float> poison, float %2007, i64 0 + %2375 = insertelement <4 x float> %2374, float %2008, i64 1 + %2376 = insertelement <4 x float> %2375, float %2009, i64 2 + %2377 = insertelement <4 x float> %2376, float %2010, i64 3 + %2378 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2370, <4 x float> %2377, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2379 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2371, <4 x float> %2378, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2380 = insertelement <4 x float> poison, float %2011, i64 0 + %2381 = insertelement <4 x float> %2380, float %2012, i64 1 + %2382 = insertelement <4 x float> %2381, float %2013, i64 2 + %2383 = insertelement <4 x float> %2382, float %2014, i64 3 + %2384 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2370, <4 x float> %2383, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2385 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2371, <4 x float> %2384, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2386 = insertelement <4 x float> poison, float %2015, i64 0 + %2387 = insertelement <4 x float> %2386, float %2016, i64 1 + %2388 = insertelement <4 x float> %2387, float %2017, i64 2 + %2389 = insertelement <4 x float> %2388, float %2018, i64 3 + %2390 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> 
%2372, <4 x float> %2389, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2391 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2373, <4 x float> %2390, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2392 = insertelement <4 x float> poison, float %2019, i64 0 + %2393 = insertelement <4 x float> %2392, float %2020, i64 1 + %2394 = insertelement <4 x float> %2393, float %2021, i64 2 + %2395 = insertelement <4 x float> %2394, float %2022, i64 3 + %2396 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2372, <4 x float> %2395, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2397 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2373, <4 x float> %2396, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2398 = insertelement <4 x float> poison, float %2023, i64 0 + %2399 = insertelement <4 x float> %2398, float %2024, i64 1 + %2400 = insertelement <4 x float> %2399, float %2025, i64 2 + %2401 = insertelement <4 x float> %2400, float %2026, i64 3 + %2402 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2370, <4 x float> %2401, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2403 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2371, <4 x float> %2402, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2404 = insertelement <4 x float> poison, float %2027, i64 0 + %2405 = insertelement <4 x float> %2404, float %2028, i64 1 + %2406 = insertelement <4 x float> %2405, float %2029, i64 2 + %2407 = insertelement <4 x float> %2406, float %2030, i64 3 + %2408 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2370, <4 x float> %2407, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + %2409 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2371, <4 x float> %2408, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2410 = insertelement <4 x float> poison, float %2031, i64 0 + %2411 = insertelement <4 x float> %2410, float %2032, i64 1 + %2412 = insertelement <4 x float> %2411, float %2033, i64 2 + %2413 = insertelement <4 x float> %2412, float %2034, i64 3 + %2414 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2372, <4 x float> %2413, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2415 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2373, <4 x float> %2414, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2416 = insertelement <4 x float> poison, float %2035, i64 0 + %2417 = insertelement <4 x float> %2416, float %2036, i64 1 + %2418 = insertelement <4 x float> %2417, float %2037, i64 2 + %2419 = insertelement <4 x float> %2418, float %2038, i64 3 + %2420 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2372, <4 x float> %2419, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2421 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2373, <4 x float> %2420, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !111 + %2422 = or disjoint i32 %.pre-phi1014, 2048 + %2423 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1014 + %2424 = load <8 x half>, ptr addrspace(3) %2423, align 16 + %2425 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2422 + %2426 = load <8 x half>, ptr addrspace(3) %2425, align 16 + %2427 = 
shufflevector <8 x half> %2424, <8 x half> poison, <4 x i32> + %2428 = shufflevector <8 x half> %2424, <8 x half> poison, <4 x i32> + %2429 = shufflevector <8 x half> %2426, <8 x half> poison, <4 x i32> + %2430 = shufflevector <8 x half> %2426, <8 x half> poison, <4 x i32> + %2431 = insertelement <4 x float> poison, float %1975, i64 0 + %2432 = insertelement <4 x float> %2431, float %1976, i64 1 + %2433 = insertelement <4 x float> %2432, float %1977, i64 2 + %2434 = insertelement <4 x float> %2433, float %1978, i64 3 + %2435 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2310, <4 x float> %2434, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2436 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2311, <4 x float> %2435, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2437 = insertelement <4 x float> poison, float %1979, i64 0 + %2438 = insertelement <4 x float> %2437, float %1980, i64 1 + %2439 = insertelement <4 x float> %2438, float %1981, i64 2 + %2440 = insertelement <4 x float> %2439, float %1982, i64 3 + %2441 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2310, <4 x float> %2440, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2442 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2311, <4 x float> %2441, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2443 = insertelement <4 x float> poison, float %1983, i64 0 + %2444 = insertelement <4 x float> %2443, float %1984, i64 1 + %2445 = insertelement <4 x float> %2444, float %1985, i64 2 + %2446 = insertelement <4 x float> %2445, float %1986, i64 3 + %2447 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2312, <4 x float> %2446, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 
2038) + %2448 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2313, <4 x float> %2447, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2449 = insertelement <4 x float> poison, float %1987, i64 0 + %2450 = insertelement <4 x float> %2449, float %1988, i64 1 + %2451 = insertelement <4 x float> %2450, float %1989, i64 2 + %2452 = insertelement <4 x float> %2451, float %1990, i64 3 + %2453 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2312, <4 x float> %2452, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2454 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2313, <4 x float> %2453, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2455 = or disjoint i32 %.pre-phi1018, 2048 + %2456 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1018 + %2457 = load <8 x half>, ptr addrspace(3) %2456, align 16 + %2458 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2455 + %2459 = load <8 x half>, ptr addrspace(3) %2458, align 16 + %2460 = shufflevector <8 x half> %2457, <8 x half> poison, <4 x i32> + %2461 = shufflevector <8 x half> %2457, <8 x half> poison, <4 x i32> + %2462 = shufflevector <8 x half> %2459, <8 x half> poison, <4 x i32> + %2463 = shufflevector <8 x half> %2459, <8 x half> poison, <4 x i32> + %2464 = insertelement <4 x float> poison, float %1991, i64 0 + %2465 = insertelement <4 x float> %2464, float %1992, i64 1 + %2466 = insertelement <4 x float> %2465, float %1993, i64 2 + %2467 = insertelement <4 x float> %2466, float %1994, i64 3 + %2468 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2310, <4 x float> %2467, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2469 = tail 
call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2311, <4 x float> %2468, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2470 = insertelement <4 x float> poison, float %1995, i64 0 + %2471 = insertelement <4 x float> %2470, float %1996, i64 1 + %2472 = insertelement <4 x float> %2471, float %1997, i64 2 + %2473 = insertelement <4 x float> %2472, float %1998, i64 3 + %2474 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2310, <4 x float> %2473, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2475 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2311, <4 x float> %2474, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2476 = insertelement <4 x float> poison, float %1999, i64 0 + %2477 = insertelement <4 x float> %2476, float %2000, i64 1 + %2478 = insertelement <4 x float> %2477, float %2001, i64 2 + %2479 = insertelement <4 x float> %2478, float %2002, i64 3 + %2480 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2312, <4 x float> %2479, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2481 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2313, <4 x float> %2480, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2482 = insertelement <4 x float> poison, float %2003, i64 0 + %2483 = insertelement <4 x float> %2482, float %2004, i64 1 + %2484 = insertelement <4 x float> %2483, float %2005, i64 2 + %2485 = insertelement <4 x float> %2484, float %2006, i64 3 + %2486 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2312, <4 x float> %2485, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2487 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x 
half> %2313, <4 x float> %2486, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2488 = insertelement <4 x float> poison, float %2039, i64 0 + %2489 = insertelement <4 x float> %2488, float %2040, i64 1 + %2490 = insertelement <4 x float> %2489, float %2041, i64 2 + %2491 = insertelement <4 x float> %2490, float %2042, i64 3 + %2492 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2370, <4 x float> %2491, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2493 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2371, <4 x float> %2492, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2494 = insertelement <4 x float> poison, float %2043, i64 0 + %2495 = insertelement <4 x float> %2494, float %2044, i64 1 + %2496 = insertelement <4 x float> %2495, float %2045, i64 2 + %2497 = insertelement <4 x float> %2496, float %2046, i64 3 + %2498 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2370, <4 x float> %2497, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2499 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2371, <4 x float> %2498, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2500 = insertelement <4 x float> poison, float %2047, i64 0 + %2501 = insertelement <4 x float> %2500, float %2048, i64 1 + %2502 = insertelement <4 x float> %2501, float %2049, i64 2 + %2503 = insertelement <4 x float> %2502, float %2050, i64 3 + %2504 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2372, <4 x float> %2503, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2505 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2373, <4 x float> %2504, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + %2506 = insertelement <4 x float> poison, float %2051, i64 0 + %2507 = insertelement <4 x float> %2506, float %2052, i64 1 + %2508 = insertelement <4 x float> %2507, float %2053, i64 2 + %2509 = insertelement <4 x float> %2508, float %2054, i64 3 + %2510 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2372, <4 x float> %2509, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2511 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2373, <4 x float> %2510, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2512 = insertelement <4 x float> poison, float %2055, i64 0 + %2513 = insertelement <4 x float> %2512, float %2056, i64 1 + %2514 = insertelement <4 x float> %2513, float %2057, i64 2 + %2515 = insertelement <4 x float> %2514, float %2058, i64 3 + %2516 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2370, <4 x float> %2515, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2517 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2371, <4 x float> %2516, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2518 = insertelement <4 x float> poison, float %2059, i64 0 + %2519 = insertelement <4 x float> %2518, float %2060, i64 1 + %2520 = insertelement <4 x float> %2519, float %2061, i64 2 + %2521 = insertelement <4 x float> %2520, float %2062, i64 3 + %2522 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2370, <4 x float> %2521, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2523 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2371, <4 x float> %2522, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2524 = insertelement <4 x float> 
poison, float %2063, i64 0 + %2525 = insertelement <4 x float> %2524, float %2064, i64 1 + %2526 = insertelement <4 x float> %2525, float %2065, i64 2 + %2527 = insertelement <4 x float> %2526, float %2066, i64 3 + %2528 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2372, <4 x float> %2527, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2529 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2373, <4 x float> %2528, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2530 = insertelement <4 x float> poison, float %2067, i64 0 + %2531 = insertelement <4 x float> %2530, float %2068, i64 1 + %2532 = insertelement <4 x float> %2531, float %2069, i64 2 + %2533 = insertelement <4 x float> %2532, float %2070, i64 3 + %2534 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2372, <4 x float> %2533, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2535 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2373, <4 x float> %2534, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + %2536 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1022 + %2537 = load <8 x half>, ptr addrspace(3) %2536, align 16 + %2538 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1026 + %2539 = load <8 x half>, ptr addrspace(3) %2538, align 16 + %2540 = shufflevector <8 x half> %2537, <8 x half> poison, <4 x i32> + %2541 = shufflevector <8 x half> %2537, <8 x half> poison, <4 x i32> + %2542 = shufflevector <8 x half> %2539, <8 x half> poison, <4 x i32> + %2543 = shufflevector <8 x half> %2539, <8 x half> poison, <4 x i32> + %2544 = insertelement <4 x float> poison, float %2071, i64 0 + %2545 = insertelement <4 x float> %2544, float %2072, i64 1 + %2546 = 
insertelement <4 x float> %2545, float %2073, i64 2 + %2547 = insertelement <4 x float> %2546, float %2074, i64 3 + %2548 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2540, <4 x float> %2547, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2549 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2541, <4 x float> %2548, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2550 = insertelement <4 x float> poison, float %2075, i64 0 + %2551 = insertelement <4 x float> %2550, float %2076, i64 1 + %2552 = insertelement <4 x float> %2551, float %2077, i64 2 + %2553 = insertelement <4 x float> %2552, float %2078, i64 3 + %2554 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2540, <4 x float> %2553, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2555 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2541, <4 x float> %2554, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2556 = insertelement <4 x float> poison, float %2079, i64 0 + %2557 = insertelement <4 x float> %2556, float %2080, i64 1 + %2558 = insertelement <4 x float> %2557, float %2081, i64 2 + %2559 = insertelement <4 x float> %2558, float %2082, i64 3 + %2560 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2542, <4 x float> %2559, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2561 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2543, <4 x float> %2560, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2562 = insertelement <4 x float> poison, float %2083, i64 0 + %2563 = insertelement <4 x float> %2562, float %2084, i64 1 + %2564 = insertelement <4 x float> %2563, float %2085, i64 2 + %2565 = insertelement 
<4 x float> %2564, float %2086, i64 3 + %2566 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2542, <4 x float> %2565, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2567 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2543, <4 x float> %2566, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2568 = insertelement <4 x float> poison, float %2087, i64 0 + %2569 = insertelement <4 x float> %2568, float %2088, i64 1 + %2570 = insertelement <4 x float> %2569, float %2089, i64 2 + %2571 = insertelement <4 x float> %2570, float %2090, i64 3 + %2572 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2540, <4 x float> %2571, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2573 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2541, <4 x float> %2572, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2574 = insertelement <4 x float> poison, float %2091, i64 0 + %2575 = insertelement <4 x float> %2574, float %2092, i64 1 + %2576 = insertelement <4 x float> %2575, float %2093, i64 2 + %2577 = insertelement <4 x float> %2576, float %2094, i64 3 + %2578 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2540, <4 x float> %2577, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2579 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2541, <4 x float> %2578, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2580 = insertelement <4 x float> poison, float %2095, i64 0 + %2581 = insertelement <4 x float> %2580, float %2096, i64 1 + %2582 = insertelement <4 x float> %2581, float %2097, i64 2 + %2583 = insertelement <4 x float> %2582, float %2098, i64 3 + %2584 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2542, <4 x float> %2583, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2585 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2543, <4 x float> %2584, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2586 = insertelement <4 x float> poison, float %2099, i64 0 + %2587 = insertelement <4 x float> %2586, float %2100, i64 1 + %2588 = insertelement <4 x float> %2587, float %2101, i64 2 + %2589 = insertelement <4 x float> %2588, float %2102, i64 3 + %2590 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2542, <4 x float> %2589, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2591 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2543, <4 x float> %2590, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2592 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1030 + %2593 = load <8 x half>, ptr addrspace(3) %2592, align 16 + %2594 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1034 + %2595 = load <8 x half>, ptr addrspace(3) %2594, align 16 + %2596 = shufflevector <8 x half> %2593, <8 x half> poison, <4 x i32> + %2597 = shufflevector <8 x half> %2593, <8 x half> poison, <4 x i32> + %2598 = shufflevector <8 x half> %2595, <8 x half> poison, <4 x i32> + %2599 = shufflevector <8 x half> %2595, <8 x half> poison, <4 x i32> + %2600 = insertelement <4 x float> poison, float %2135, i64 0 + %2601 = insertelement <4 x float> %2600, float %2136, i64 1 + %2602 = insertelement <4 x float> %2601, float %2137, i64 2 + %2603 = insertelement <4 x float> %2602, float %2138, i64 3 + %2604 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2596, <4 x float> %2603, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + %2605 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2597, <4 x float> %2604, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2606 = insertelement <4 x float> poison, float %2139, i64 0 + %2607 = insertelement <4 x float> %2606, float %2140, i64 1 + %2608 = insertelement <4 x float> %2607, float %2141, i64 2 + %2609 = insertelement <4 x float> %2608, float %2142, i64 3 + %2610 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2596, <4 x float> %2609, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2611 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2597, <4 x float> %2610, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2612 = insertelement <4 x float> poison, float %2143, i64 0 + %2613 = insertelement <4 x float> %2612, float %2144, i64 1 + %2614 = insertelement <4 x float> %2613, float %2145, i64 2 + %2615 = insertelement <4 x float> %2614, float %2146, i64 3 + %2616 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2598, <4 x float> %2615, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2617 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2599, <4 x float> %2616, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2618 = insertelement <4 x float> poison, float %2147, i64 0 + %2619 = insertelement <4 x float> %2618, float %2148, i64 1 + %2620 = insertelement <4 x float> %2619, float %2149, i64 2 + %2621 = insertelement <4 x float> %2620, float %2150, i64 3 + %2622 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2598, <4 x float> %2621, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2623 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2599, <4 x float> %2622, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2624 = insertelement <4 x float> poison, float %2151, i64 0 + %2625 = insertelement <4 x float> %2624, float %2152, i64 1 + %2626 = insertelement <4 x float> %2625, float %2153, i64 2 + %2627 = insertelement <4 x float> %2626, float %2154, i64 3 + %2628 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2596, <4 x float> %2627, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2629 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2597, <4 x float> %2628, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2630 = insertelement <4 x float> poison, float %2155, i64 0 + %2631 = insertelement <4 x float> %2630, float %2156, i64 1 + %2632 = insertelement <4 x float> %2631, float %2157, i64 2 + %2633 = insertelement <4 x float> %2632, float %2158, i64 3 + %2634 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2596, <4 x float> %2633, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2635 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2597, <4 x float> %2634, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2636 = insertelement <4 x float> poison, float %2159, i64 0 + %2637 = insertelement <4 x float> %2636, float %2160, i64 1 + %2638 = insertelement <4 x float> %2637, float %2161, i64 2 + %2639 = insertelement <4 x float> %2638, float %2162, i64 3 + %2640 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2598, <4 x float> %2639, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2641 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2599, <4 x 
float> %2640, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2642 = insertelement <4 x float> poison, float %2163, i64 0 + %2643 = insertelement <4 x float> %2642, float %2164, i64 1 + %2644 = insertelement <4 x float> %2643, float %2165, i64 2 + %2645 = insertelement <4 x float> %2644, float %2166, i64 3 + %2646 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2598, <4 x float> %2645, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2647 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2599, <4 x float> %2646, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + %2648 = insertelement <4 x float> poison, float %2103, i64 0 + %2649 = insertelement <4 x float> %2648, float %2104, i64 1 + %2650 = insertelement <4 x float> %2649, float %2105, i64 2 + %2651 = insertelement <4 x float> %2650, float %2106, i64 3 + %2652 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2540, <4 x float> %2651, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2653 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2541, <4 x float> %2652, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2654 = insertelement <4 x float> poison, float %2107, i64 0 + %2655 = insertelement <4 x float> %2654, float %2108, i64 1 + %2656 = insertelement <4 x float> %2655, float %2109, i64 2 + %2657 = insertelement <4 x float> %2656, float %2110, i64 3 + %2658 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2540, <4 x float> %2657, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2659 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2541, <4 x float> %2658, i32 0, i32 
0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2660 = insertelement <4 x float> poison, float %2111, i64 0 + %2661 = insertelement <4 x float> %2660, float %2112, i64 1 + %2662 = insertelement <4 x float> %2661, float %2113, i64 2 + %2663 = insertelement <4 x float> %2662, float %2114, i64 3 + %2664 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2542, <4 x float> %2663, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2665 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2543, <4 x float> %2664, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2666 = insertelement <4 x float> poison, float %2115, i64 0 + %2667 = insertelement <4 x float> %2666, float %2116, i64 1 + %2668 = insertelement <4 x float> %2667, float %2117, i64 2 + %2669 = insertelement <4 x float> %2668, float %2118, i64 3 + %2670 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2542, <4 x float> %2669, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2671 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2543, <4 x float> %2670, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2672 = insertelement <4 x float> poison, float %2119, i64 0 + %2673 = insertelement <4 x float> %2672, float %2120, i64 1 + %2674 = insertelement <4 x float> %2673, float %2121, i64 2 + %2675 = insertelement <4 x float> %2674, float %2122, i64 3 + %2676 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2540, <4 x float> %2675, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2677 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2541, <4 x float> %2676, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2678 = 
insertelement <4 x float> poison, float %2123, i64 0 + %2679 = insertelement <4 x float> %2678, float %2124, i64 1 + %2680 = insertelement <4 x float> %2679, float %2125, i64 2 + %2681 = insertelement <4 x float> %2680, float %2126, i64 3 + %2682 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2540, <4 x float> %2681, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2683 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2541, <4 x float> %2682, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2684 = insertelement <4 x float> poison, float %2127, i64 0 + %2685 = insertelement <4 x float> %2684, float %2128, i64 1 + %2686 = insertelement <4 x float> %2685, float %2129, i64 2 + %2687 = insertelement <4 x float> %2686, float %2130, i64 3 + %2688 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2542, <4 x float> %2687, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2689 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2543, <4 x float> %2688, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2690 = insertelement <4 x float> poison, float %2131, i64 0 + %2691 = insertelement <4 x float> %2690, float %2132, i64 1 + %2692 = insertelement <4 x float> %2691, float %2133, i64 2 + %2693 = insertelement <4 x float> %2692, float %2134, i64 3 + %2694 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2542, <4 x float> %2693, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2695 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2543, <4 x float> %2694, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2696 = insertelement <4 x float> poison, float %2167, i64 0 + %2697 = insertelement 
<4 x float> %2696, float %2168, i64 1 + %2698 = insertelement <4 x float> %2697, float %2169, i64 2 + %2699 = insertelement <4 x float> %2698, float %2170, i64 3 + %2700 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2596, <4 x float> %2699, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2701 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2597, <4 x float> %2700, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2702 = insertelement <4 x float> poison, float %2171, i64 0 + %2703 = insertelement <4 x float> %2702, float %2172, i64 1 + %2704 = insertelement <4 x float> %2703, float %2173, i64 2 + %2705 = insertelement <4 x float> %2704, float %2174, i64 3 + %2706 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2596, <4 x float> %2705, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2707 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2597, <4 x float> %2706, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2708 = insertelement <4 x float> poison, float %2175, i64 0 + %2709 = insertelement <4 x float> %2708, float %2176, i64 1 + %2710 = insertelement <4 x float> %2709, float %2177, i64 2 + %2711 = insertelement <4 x float> %2710, float %2178, i64 3 + %2712 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2598, <4 x float> %2711, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2713 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2599, <4 x float> %2712, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2714 = insertelement <4 x float> poison, float %2179, i64 0 + %2715 = insertelement <4 x float> %2714, float %2180, i64 1 + %2716 = insertelement <4 x float> 
%2715, float %2181, i64 2 + %2717 = insertelement <4 x float> %2716, float %2182, i64 3 + %2718 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2598, <4 x float> %2717, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2719 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2599, <4 x float> %2718, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2720 = insertelement <4 x float> poison, float %2183, i64 0 + %2721 = insertelement <4 x float> %2720, float %2184, i64 1 + %2722 = insertelement <4 x float> %2721, float %2185, i64 2 + %2723 = insertelement <4 x float> %2722, float %2186, i64 3 + %2724 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2596, <4 x float> %2723, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2725 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2597, <4 x float> %2724, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2726 = insertelement <4 x float> poison, float %2187, i64 0 + %2727 = insertelement <4 x float> %2726, float %2188, i64 1 + %2728 = insertelement <4 x float> %2727, float %2189, i64 2 + %2729 = insertelement <4 x float> %2728, float %2190, i64 3 + %2730 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2596, <4 x float> %2729, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2731 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2597, <4 x float> %2730, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2732 = insertelement <4 x float> poison, float %2191, i64 0 + %2733 = insertelement <4 x float> %2732, float %2192, i64 1 + %2734 = insertelement <4 x float> %2733, float %2193, i64 2 + %2735 = insertelement <4 x float> %2734, float 
%2194, i64 3 + %2736 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2598, <4 x float> %2735, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2737 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2599, <4 x float> %2736, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2738 = insertelement <4 x float> poison, float %2195, i64 0 + %2739 = insertelement <4 x float> %2738, float %2196, i64 1 + %2740 = insertelement <4 x float> %2739, float %2197, i64 2 + %2741 = insertelement <4 x float> %2740, float %2198, i64 3 + %2742 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2598, <4 x float> %2741, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2743 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2599, <4 x float> %2742, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + %2744 = or disjoint i32 %.pre-phi1042, 2048 + %2745 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1042 + %2746 = load <8 x half>, ptr addrspace(3) %2745, align 16 + %2747 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2744 + %2748 = load <8 x half>, ptr addrspace(3) %2747, align 16 + %2749 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1044 + %2750 = load <8 x half>, ptr addrspace(3) %2749, align 16 + %2751 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1046 + %2752 = load <8 x half>, ptr addrspace(3) %2751, align 16 + %2753 = shufflevector <8 x half> %2750, <8 x half> poison, <4 x i32> + %2754 = shufflevector <8 x half> %2750, <8 x half> poison, <4 x i32> + %2755 = shufflevector <8 x half> %2752, <8 x half> poison, <4 x i32> 
+ %2756 = shufflevector <8 x half> %2752, <8 x half> poison, <4 x i32> + %2757 = shufflevector <8 x half> %2746, <8 x half> poison, <4 x i32> + %2758 = shufflevector <8 x half> %2746, <8 x half> poison, <4 x i32> + %2759 = shufflevector <8 x half> %2748, <8 x half> poison, <4 x i32> + %2760 = shufflevector <8 x half> %2748, <8 x half> poison, <4 x i32> + %2761 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2753, <4 x float> %2323, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2762 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2754, <4 x float> %2761, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2763 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2753, <4 x float> %2329, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2764 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2754, <4 x float> %2763, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2765 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2755, <4 x float> %2335, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2766 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2756, <4 x float> %2765, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2767 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2755, <4 x float> %2341, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2768 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2756, <4 x float> %2767, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2769 = or disjoint i32 %.pre-phi1048, 2048 + %2770 = getelementptr half, 
ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1048 + %2771 = load <8 x half>, ptr addrspace(3) %2770, align 16 + %2772 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2769 + %2773 = load <8 x half>, ptr addrspace(3) %2772, align 16 + %2774 = shufflevector <8 x half> %2771, <8 x half> poison, <4 x i32> + %2775 = shufflevector <8 x half> %2771, <8 x half> poison, <4 x i32> + %2776 = shufflevector <8 x half> %2773, <8 x half> poison, <4 x i32> + %2777 = shufflevector <8 x half> %2773, <8 x half> poison, <4 x i32> + %2778 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2753, <4 x float> %2351, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2779 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2754, <4 x float> %2778, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2780 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2753, <4 x float> %2357, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2781 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2754, <4 x float> %2780, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2782 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2755, <4 x float> %2363, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2783 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2756, <4 x float> %2782, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2784 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2755, <4 x float> %2369, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2785 
= tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2756, <4 x float> %2784, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2786 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1050 + %2787 = load <8 x half>, ptr addrspace(3) %2786, align 16 + %2788 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1052 + %2789 = load <8 x half>, ptr addrspace(3) %2788, align 16 + %2790 = shufflevector <8 x half> %2787, <8 x half> poison, <4 x i32> + %2791 = shufflevector <8 x half> %2787, <8 x half> poison, <4 x i32> + %2792 = shufflevector <8 x half> %2789, <8 x half> poison, <4 x i32> + %2793 = shufflevector <8 x half> %2789, <8 x half> poison, <4 x i32> + %2794 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2790, <4 x float> %2379, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2795 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2791, <4 x float> %2794, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2796 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2790, <4 x float> %2385, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2797 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2791, <4 x float> %2796, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2798 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2792, <4 x float> %2391, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2799 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2793, <4 x float> %2798, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2800 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x 
half> %2759, <4 x half> %2792, <4 x float> %2397, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2801 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2793, <4 x float> %2800, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2802 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2790, <4 x float> %2403, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2803 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2791, <4 x float> %2802, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2804 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2790, <4 x float> %2409, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2805 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2791, <4 x float> %2804, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2806 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2792, <4 x float> %2415, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2807 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2793, <4 x float> %2806, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2808 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2792, <4 x float> %2421, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2809 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2793, <4 x float> %2808, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + %2810 = getelementptr half, ptr addrspace(3) 
getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1054 + %2811 = load <8 x half>, ptr addrspace(3) %2810, align 16 + %2812 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1056 + %2813 = load <8 x half>, ptr addrspace(3) %2812, align 16 + %2814 = shufflevector <8 x half> %2811, <8 x half> poison, <4 x i32> + %2815 = shufflevector <8 x half> %2811, <8 x half> poison, <4 x i32> + %2816 = shufflevector <8 x half> %2813, <8 x half> poison, <4 x i32> + %2817 = shufflevector <8 x half> %2813, <8 x half> poison, <4 x i32> + %2818 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2753, <4 x float> %2436, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2819 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2754, <4 x float> %2818, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2820 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2753, <4 x float> %2442, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2821 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2754, <4 x float> %2820, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2822 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2755, <4 x float> %2448, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2823 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2756, <4 x float> %2822, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2824 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2755, <4 x float> %2454, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2825 = tail 
call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2756, <4 x float> %2824, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2826 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1058 + %2827 = load <8 x half>, ptr addrspace(3) %2826, align 16 + %2828 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1060 + %2829 = load <8 x half>, ptr addrspace(3) %2828, align 16 + %2830 = shufflevector <8 x half> %2827, <8 x half> poison, <4 x i32> + %2831 = shufflevector <8 x half> %2827, <8 x half> poison, <4 x i32> + %2832 = shufflevector <8 x half> %2829, <8 x half> poison, <4 x i32> + %2833 = shufflevector <8 x half> %2829, <8 x half> poison, <4 x i32> + %2834 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2753, <4 x float> %2469, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2835 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2754, <4 x float> %2834, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2836 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2753, <4 x float> %2475, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2837 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2754, <4 x float> %2836, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2838 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2755, <4 x float> %2481, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2839 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2756, <4 x float> %2838, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + %2840 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2755, <4 x float> %2487, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2841 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2756, <4 x float> %2840, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2842 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2790, <4 x float> %2493, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2843 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2791, <4 x float> %2842, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2844 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2790, <4 x float> %2499, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2845 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2791, <4 x float> %2844, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2846 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2792, <4 x float> %2505, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2847 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2793, <4 x float> %2846, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2848 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2792, <4 x float> %2511, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2849 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2793, <4 x float> %2848, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + 
%2850 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2790, <4 x float> %2517, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2851 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2791, <4 x float> %2850, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2852 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2790, <4 x float> %2523, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2853 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2791, <4 x float> %2852, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2854 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2792, <4 x float> %2529, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2855 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2793, <4 x float> %2854, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2856 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2792, <4 x float> %2535, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2857 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2793, <4 x float> %2856, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + %2858 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1062 + %2859 = load <8 x half>, ptr addrspace(3) %2858, align 16 + %2860 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1064 + %2861 = load <8 x half>, ptr addrspace(3) %2860, align 16 + %2862 = shufflevector <8 x half> %2859, <8 x half> poison, <4 x i32> + %2863 = shufflevector <8 x 
half> %2859, <8 x half> poison, <4 x i32> + %2864 = shufflevector <8 x half> %2861, <8 x half> poison, <4 x i32> + %2865 = shufflevector <8 x half> %2861, <8 x half> poison, <4 x i32> + %2866 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2862, <4 x float> %2549, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2867 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2863, <4 x float> %2866, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2868 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2862, <4 x float> %2555, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2869 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2863, <4 x float> %2868, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2870 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2864, <4 x float> %2561, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2871 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2865, <4 x float> %2870, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2872 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2864, <4 x float> %2567, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2873 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2865, <4 x float> %2872, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2874 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2862, <4 x float> %2573, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2875 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2863, <4 x float> %2874, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2876 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2862, <4 x float> %2579, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2877 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2863, <4 x float> %2876, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2878 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2864, <4 x float> %2585, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2879 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2865, <4 x float> %2878, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2880 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2864, <4 x float> %2591, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2881 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2865, <4 x float> %2880, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2882 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1066 + %2883 = load <8 x half>, ptr addrspace(3) %2882, align 16 + %2884 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1068 + %2885 = load <8 x half>, ptr addrspace(3) %2884, align 16 + %2886 = shufflevector <8 x half> %2883, <8 x half> poison, <4 x i32> + %2887 = shufflevector <8 x half> %2883, <8 x half> poison, <4 x i32> + %2888 = shufflevector <8 x half> %2885, <8 x half> poison, <4 x i32> + %2889 = shufflevector <8 x half> %2885, <8 x half> poison, <4 x i32> + %2890 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> 
%2886, <4 x float> %2605, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2891 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2887, <4 x float> %2890, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2892 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2886, <4 x float> %2611, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2893 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2887, <4 x float> %2892, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2894 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2888, <4 x float> %2617, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2895 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2889, <4 x float> %2894, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2896 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2888, <4 x float> %2623, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2897 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2889, <4 x float> %2896, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2898 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2886, <4 x float> %2629, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2899 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2887, <4 x float> %2898, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2900 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2886, <4 x float> %2635, i32 0, i32 0, 
i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2901 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2887, <4 x float> %2900, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2902 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2888, <4 x float> %2641, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2903 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2889, <4 x float> %2902, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2904 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2888, <4 x float> %2647, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2905 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2889, <4 x float> %2904, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + %2906 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2862, <4 x float> %2653, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2907 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2863, <4 x float> %2906, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2908 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2862, <4 x float> %2659, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2909 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2863, <4 x float> %2908, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2910 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2864, <4 x float> %2665, 
i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2911 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2865, <4 x float> %2910, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2912 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2864, <4 x float> %2671, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2913 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2865, <4 x float> %2912, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2914 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2862, <4 x float> %2677, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2915 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2863, <4 x float> %2914, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2916 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2862, <4 x float> %2683, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2917 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2863, <4 x float> %2916, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2918 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2864, <4 x float> %2689, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2919 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2865, <4 x float> %2918, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2920 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2864, <4 x float> %2695, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + %2921 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2865, <4 x float> %2920, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2922 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2886, <4 x float> %2701, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2923 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2887, <4 x float> %2922, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2924 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2886, <4 x float> %2707, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2925 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2887, <4 x float> %2924, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2926 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2888, <4 x float> %2713, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2927 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2889, <4 x float> %2926, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2928 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2888, <4 x float> %2719, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2929 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2889, <4 x float> %2928, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2930 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2886, <4 x float> %2725, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + 
%2931 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2887, <4 x float> %2930, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2932 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2886, <4 x float> %2731, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2933 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2887, <4 x float> %2932, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2934 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2888, <4 x float> %2737, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2935 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2889, <4 x float> %2934, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2936 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2888, <4 x float> %2743, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %2937 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2889, <4 x float> %2936, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + fence syncscope("workgroup") release + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + %2938 = shufflevector <2 x half> %2199, <2 x half> %2263, <8 x i32> + %2939 = shufflevector <2 x half> %2264, <2 x half> poison, <8 x i32> + %2940 = shufflevector <8 x half> %2938, <8 x half> %2939, <8 x i32> + %2941 = shufflevector <2 x half> %2200, <2 x half> poison, <8 x i32> + %2942 = shufflevector <8 x half> %2940, <8 x half> %2941, <8 x i32> + store <8 x half> %2942, ptr addrspace(3) %199, align 16 + %2943 = shufflevector <2 x half> %2201, <2 x 
half> %2265, <8 x i32> + %2944 = shufflevector <2 x half> %2266, <2 x half> poison, <8 x i32> + %2945 = shufflevector <8 x half> %2943, <8 x half> %2944, <8 x i32> + %2946 = shufflevector <2 x half> %2202, <2 x half> poison, <8 x i32> + %2947 = shufflevector <8 x half> %2945, <8 x half> %2946, <8 x i32> + store <8 x half> %2947, ptr addrspace(3) %201, align 16 + %2948 = shufflevector <2 x half> %2203, <2 x half> %2267, <8 x i32> + %2949 = shufflevector <2 x half> %2268, <2 x half> poison, <8 x i32> + %2950 = shufflevector <8 x half> %2948, <8 x half> %2949, <8 x i32> + %2951 = shufflevector <2 x half> %2204, <2 x half> poison, <8 x i32> + %2952 = shufflevector <8 x half> %2950, <8 x half> %2951, <8 x i32> + store <8 x half> %2952, ptr addrspace(3) %203, align 16 + %2953 = shufflevector <2 x half> %2205, <2 x half> %2269, <8 x i32> + %2954 = shufflevector <2 x half> %2270, <2 x half> poison, <8 x i32> + %2955 = shufflevector <8 x half> %2953, <8 x half> %2954, <8 x i32> + %2956 = shufflevector <2 x half> %2206, <2 x half> poison, <8 x i32> + %2957 = shufflevector <8 x half> %2955, <8 x half> %2956, <8 x i32> + store <8 x half> %2957, ptr addrspace(3) %205, align 16 + %2958 = shufflevector <2 x half> %2207, <2 x half> %2271, <8 x i32> + %2959 = shufflevector <2 x half> %2272, <2 x half> poison, <8 x i32> + %2960 = shufflevector <8 x half> %2958, <8 x half> %2959, <8 x i32> + %2961 = shufflevector <2 x half> %2208, <2 x half> poison, <8 x i32> + %2962 = shufflevector <8 x half> %2960, <8 x half> %2961, <8 x i32> + store <8 x half> %2962, ptr addrspace(3) %207, align 16 + %2963 = shufflevector <2 x half> %2209, <2 x half> %2273, <8 x i32> + %2964 = shufflevector <2 x half> %2274, <2 x half> poison, <8 x i32> + %2965 = shufflevector <8 x half> %2963, <8 x half> %2964, <8 x i32> + %2966 = shufflevector <2 x half> %2210, <2 x half> poison, <8 x i32> + %2967 = shufflevector <8 x half> %2965, <8 x half> %2966, <8 x i32> + store <8 x half> %2967, ptr addrspace(3) %209, align 
16 + %2968 = shufflevector <2 x half> %2211, <2 x half> %2275, <8 x i32> + %2969 = shufflevector <2 x half> %2276, <2 x half> poison, <8 x i32> + %2970 = shufflevector <8 x half> %2968, <8 x half> %2969, <8 x i32> + %2971 = shufflevector <2 x half> %2212, <2 x half> poison, <8 x i32> + %2972 = shufflevector <8 x half> %2970, <8 x half> %2971, <8 x i32> + store <8 x half> %2972, ptr addrspace(3) %211, align 16 + %2973 = shufflevector <2 x half> %2213, <2 x half> %2277, <8 x i32> + %2974 = shufflevector <2 x half> %2278, <2 x half> poison, <8 x i32> + %2975 = shufflevector <8 x half> %2973, <8 x half> %2974, <8 x i32> + %2976 = shufflevector <2 x half> %2214, <2 x half> poison, <8 x i32> + %2977 = shufflevector <8 x half> %2975, <8 x half> %2976, <8 x i32> + store <8 x half> %2977, ptr addrspace(3) %213, align 16 + %2978 = shufflevector <2 x half> %2215, <2 x half> %2279, <8 x i32> + %2979 = shufflevector <2 x half> %2280, <2 x half> poison, <8 x i32> + %2980 = shufflevector <8 x half> %2978, <8 x half> %2979, <8 x i32> + %2981 = shufflevector <2 x half> %2216, <2 x half> poison, <8 x i32> + %2982 = shufflevector <8 x half> %2980, <8 x half> %2981, <8 x i32> + store <8 x half> %2982, ptr addrspace(3) %214, align 16 + %2983 = shufflevector <2 x half> %2217, <2 x half> %2281, <8 x i32> + %2984 = shufflevector <2 x half> %2282, <2 x half> poison, <8 x i32> + %2985 = shufflevector <8 x half> %2983, <8 x half> %2984, <8 x i32> + %2986 = shufflevector <2 x half> %2218, <2 x half> poison, <8 x i32> + %2987 = shufflevector <8 x half> %2985, <8 x half> %2986, <8 x i32> + store <8 x half> %2987, ptr addrspace(3) %215, align 16 + %2988 = shufflevector <2 x half> %2219, <2 x half> %2283, <8 x i32> + %2989 = shufflevector <2 x half> %2284, <2 x half> poison, <8 x i32> + %2990 = shufflevector <8 x half> %2988, <8 x half> %2989, <8 x i32> + %2991 = shufflevector <2 x half> %2220, <2 x half> poison, <8 x i32> + %2992 = shufflevector <8 x half> %2990, <8 x half> %2991, <8 x i32> + 
store <8 x half> %2992, ptr addrspace(3) %216, align 16 + %2993 = shufflevector <2 x half> %2221, <2 x half> %2285, <8 x i32> + %2994 = shufflevector <2 x half> %2286, <2 x half> poison, <8 x i32> + %2995 = shufflevector <8 x half> %2993, <8 x half> %2994, <8 x i32> + %2996 = shufflevector <2 x half> %2222, <2 x half> poison, <8 x i32> + %2997 = shufflevector <8 x half> %2995, <8 x half> %2996, <8 x i32> + store <8 x half> %2997, ptr addrspace(3) %217, align 16 + %2998 = shufflevector <2 x half> %2223, <2 x half> %2287, <8 x i32> + %2999 = shufflevector <2 x half> %2288, <2 x half> poison, <8 x i32> + %3000 = shufflevector <8 x half> %2998, <8 x half> %2999, <8 x i32> + %3001 = shufflevector <2 x half> %2224, <2 x half> poison, <8 x i32> + %3002 = shufflevector <8 x half> %3000, <8 x half> %3001, <8 x i32> + store <8 x half> %3002, ptr addrspace(3) %218, align 16 + %3003 = shufflevector <2 x half> %2225, <2 x half> %2289, <8 x i32> + %3004 = shufflevector <2 x half> %2290, <2 x half> poison, <8 x i32> + %3005 = shufflevector <8 x half> %3003, <8 x half> %3004, <8 x i32> + %3006 = shufflevector <2 x half> %2226, <2 x half> poison, <8 x i32> + %3007 = shufflevector <8 x half> %3005, <8 x half> %3006, <8 x i32> + store <8 x half> %3007, ptr addrspace(3) %219, align 16 + %3008 = shufflevector <2 x half> %2227, <2 x half> %2291, <8 x i32> + %3009 = shufflevector <2 x half> %2292, <2 x half> poison, <8 x i32> + %3010 = shufflevector <8 x half> %3008, <8 x half> %3009, <8 x i32> + %3011 = shufflevector <2 x half> %2228, <2 x half> poison, <8 x i32> + %3012 = shufflevector <8 x half> %3010, <8 x half> %3011, <8 x i32> + store <8 x half> %3012, ptr addrspace(3) %220, align 16 + %3013 = shufflevector <2 x half> %2229, <2 x half> %2293, <8 x i32> + %3014 = shufflevector <2 x half> %2294, <2 x half> poison, <8 x i32> + %3015 = shufflevector <8 x half> %3013, <8 x half> %3014, <8 x i32> + %3016 = shufflevector <2 x half> %2230, <2 x half> poison, <8 x i32> + %3017 = 
shufflevector <8 x half> %3015, <8 x half> %3016, <8 x i32> + store <8 x half> %3017, ptr addrspace(3) %221, align 16 + fence syncscope("workgroup") release + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + tail call void @llvm.amdgcn.sched.barrier(i32 0) + %3018 = load <8 x half>, ptr addrspace(3) %243, align 16 + %3019 = load <8 x half>, ptr addrspace(3) %245, align 16 + %3020 = load <8 x half>, ptr addrspace(3) %233, align 16 + %3021 = load <8 x half>, ptr addrspace(3) %235, align 16 + %3022 = shufflevector <8 x half> %3020, <8 x half> poison, <4 x i32> + %3023 = shufflevector <8 x half> %3020, <8 x half> poison, <4 x i32> + %3024 = shufflevector <8 x half> %3021, <8 x half> poison, <4 x i32> + %3025 = shufflevector <8 x half> %3021, <8 x half> poison, <4 x i32> + %3026 = shufflevector <8 x half> %3018, <8 x half> poison, <4 x i32> + %3027 = shufflevector <8 x half> %3018, <8 x half> poison, <4 x i32> + %3028 = shufflevector <8 x half> %3019, <8 x half> poison, <4 x i32> + %3029 = shufflevector <8 x half> %3019, <8 x half> poison, <4 x i32> + %3030 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3022, <4 x float> %2762, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3031 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3023, <4 x float> %3030, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3032 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3022, <4 x float> %2764, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3033 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3023, <4 x float> %3032, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3034 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3024, <4 x float> %2766, 
i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3035 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3025, <4 x float> %3034, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3036 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3024, <4 x float> %2768, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3037 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3025, <4 x float> %3036, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3038 = load <8 x half>, ptr addrspace(3) %258, align 16 + %3039 = load <8 x half>, ptr addrspace(3) %260, align 16 + %3040 = shufflevector <8 x half> %3038, <8 x half> poison, <4 x i32> + %3041 = shufflevector <8 x half> %3038, <8 x half> poison, <4 x i32> + %3042 = shufflevector <8 x half> %3039, <8 x half> poison, <4 x i32> + %3043 = shufflevector <8 x half> %3039, <8 x half> poison, <4 x i32> + %3044 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3022, <4 x float> %2779, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3045 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3023, <4 x float> %3044, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3046 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3022, <4 x float> %2781, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3047 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3023, <4 x float> %3046, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3048 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3024, <4 x float> %2783, i32 0, i32 0, i32 0) + tail 
call void @llvm.amdgcn.sched.barrier(i32 2038) + %3049 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3025, <4 x float> %3048, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3050 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3024, <4 x float> %2785, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3051 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3025, <4 x float> %3050, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3052 = load <8 x half>, ptr addrspace(3) %2423, align 16 + %3053 = load <8 x half>, ptr addrspace(3) %2425, align 16 + %3054 = shufflevector <8 x half> %3052, <8 x half> poison, <4 x i32> + %3055 = shufflevector <8 x half> %3052, <8 x half> poison, <4 x i32> + %3056 = shufflevector <8 x half> %3053, <8 x half> poison, <4 x i32> + %3057 = shufflevector <8 x half> %3053, <8 x half> poison, <4 x i32> + %3058 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3022, <4 x float> %2819, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3059 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3023, <4 x float> %3058, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3060 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3022, <4 x float> %2821, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3061 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3023, <4 x float> %3060, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3062 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3024, <4 x float> %2823, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + %3063 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3025, <4 x float> %3062, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3064 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3024, <4 x float> %2825, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3065 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3025, <4 x float> %3064, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3066 = load <8 x half>, ptr addrspace(3) %2456, align 16 + %3067 = load <8 x half>, ptr addrspace(3) %2458, align 16 + %3068 = shufflevector <8 x half> %3066, <8 x half> poison, <4 x i32> + %3069 = shufflevector <8 x half> %3066, <8 x half> poison, <4 x i32> + %3070 = shufflevector <8 x half> %3067, <8 x half> poison, <4 x i32> + %3071 = shufflevector <8 x half> %3067, <8 x half> poison, <4 x i32> + %3072 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3022, <4 x float> %2835, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3073 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3023, <4 x float> %3072, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3074 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3022, <4 x float> %2837, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3075 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3023, <4 x float> %3074, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3076 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3024, <4 x float> %2839, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + %3077 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3025, <4 x float> %3076, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3078 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3024, <4 x float> %2841, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3079 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3025, <4 x float> %3078, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + %3080 = load <8 x half>, ptr addrspace(3) %251, align 16 + %3081 = load <8 x half>, ptr addrspace(3) %253, align 16 + %3082 = shufflevector <8 x half> %3080, <8 x half> poison, <4 x i32> + %3083 = shufflevector <8 x half> %3080, <8 x half> poison, <4 x i32> + %3084 = shufflevector <8 x half> %3081, <8 x half> poison, <4 x i32> + %3085 = shufflevector <8 x half> %3081, <8 x half> poison, <4 x i32> + %3086 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3082, <4 x float> %2795, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3087 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3083, <4 x float> %3086, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3088 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3082, <4 x float> %2797, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3089 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3083, <4 x float> %3088, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3090 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3084, <4 x float> %2799, i32 0, i32 0, 
i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3091 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3085, <4 x float> %3090, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3092 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3084, <4 x float> %2801, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3093 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3085, <4 x float> %3092, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3094 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3082, <4 x float> %2803, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3095 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3083, <4 x float> %3094, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3096 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3082, <4 x float> %2805, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3097 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3083, <4 x float> %3096, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3098 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3084, <4 x float> %2807, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3099 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3085, <4 x float> %3098, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3100 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3084, <4 x float> %2809, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + %3101 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3085, <4 x float> %3100, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3102 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3082, <4 x float> %2843, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3103 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3083, <4 x float> %3102, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3104 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3082, <4 x float> %2845, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3105 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3083, <4 x float> %3104, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3106 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3084, <4 x float> %2847, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3107 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3085, <4 x float> %3106, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3108 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3084, <4 x float> %2849, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3109 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3085, <4 x float> %3108, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3110 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3082, <4 x float> %2851, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + 
%3111 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3083, <4 x float> %3110, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3112 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3082, <4 x float> %2853, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3113 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3083, <4 x float> %3112, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3114 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3084, <4 x float> %2855, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3115 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3085, <4 x float> %3114, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3116 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3084, <4 x float> %2857, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3117 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3085, <4 x float> %3116, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + %3118 = load <8 x half>, ptr addrspace(3) %2536, align 16 + %3119 = load <8 x half>, ptr addrspace(3) %2538, align 16 + %3120 = shufflevector <8 x half> %3118, <8 x half> poison, <4 x i32> + %3121 = shufflevector <8 x half> %3118, <8 x half> poison, <4 x i32> + %3122 = shufflevector <8 x half> %3119, <8 x half> poison, <4 x i32> + %3123 = shufflevector <8 x half> %3119, <8 x half> poison, <4 x i32> + %3124 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3120, <4 x float> %2867, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + %3125 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3121, <4 x float> %3124, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3126 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3120, <4 x float> %2869, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3127 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3121, <4 x float> %3126, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3128 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3122, <4 x float> %2871, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3129 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3123, <4 x float> %3128, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3130 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3122, <4 x float> %2873, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3131 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3123, <4 x float> %3130, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3132 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3120, <4 x float> %2875, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3133 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3121, <4 x float> %3132, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3134 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3120, <4 x float> %2877, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + 
%3135 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3121, <4 x float> %3134, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3136 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3122, <4 x float> %2879, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3137 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3123, <4 x float> %3136, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3138 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3122, <4 x float> %2881, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3139 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3123, <4 x float> %3138, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3140 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3120, <4 x float> %2907, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3141 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3121, <4 x float> %3140, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3142 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3120, <4 x float> %2909, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3143 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3121, <4 x float> %3142, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3144 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3122, <4 x float> %2911, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3145 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3123, <4 x float> %3144, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3146 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3122, <4 x float> %2913, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3147 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3123, <4 x float> %3146, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3148 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3120, <4 x float> %2915, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3149 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3121, <4 x float> %3148, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3150 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3120, <4 x float> %2917, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3151 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3121, <4 x float> %3150, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3152 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3122, <4 x float> %2919, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3153 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3123, <4 x float> %3152, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3154 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3122, <4 x float> %2921, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3155 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x 
half> %3071, <4 x half> %3123, <4 x float> %3154, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + %3156 = load <8 x half>, ptr addrspace(3) %2592, align 16 + %3157 = load <8 x half>, ptr addrspace(3) %2594, align 16 + %3158 = shufflevector <8 x half> %3156, <8 x half> poison, <4 x i32> + %3159 = shufflevector <8 x half> %3156, <8 x half> poison, <4 x i32> + %3160 = shufflevector <8 x half> %3157, <8 x half> poison, <4 x i32> + %3161 = shufflevector <8 x half> %3157, <8 x half> poison, <4 x i32> + %3162 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3158, <4 x float> %2891, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3163 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3159, <4 x float> %3162, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3164 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3158, <4 x float> %2893, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3165 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3159, <4 x float> %3164, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3166 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3160, <4 x float> %2895, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3167 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3161, <4 x float> %3166, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3168 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3160, <4 x float> %2897, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3169 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3161, <4 x float> %3168, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3170 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3158, <4 x float> %2899, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3171 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3159, <4 x float> %3170, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3172 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3158, <4 x float> %2901, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3173 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3159, <4 x float> %3172, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3174 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3160, <4 x float> %2903, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3175 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3161, <4 x float> %3174, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3176 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3160, <4 x float> %2905, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3177 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3161, <4 x float> %3176, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3178 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3158, <4 x float> %2923, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3179 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x 
half> %3055, <4 x half> %3159, <4 x float> %3178, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3180 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3158, <4 x float> %2925, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3181 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3159, <4 x float> %3180, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3182 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3160, <4 x float> %2927, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3183 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3161, <4 x float> %3182, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3184 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3160, <4 x float> %2929, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3185 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3161, <4 x float> %3184, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3186 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3158, <4 x float> %2931, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3187 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3159, <4 x float> %3186, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3188 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3158, <4 x float> %2933, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3189 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3159, <4 x 
float> %3188, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3190 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3160, <4 x float> %2935, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3191 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3161, <4 x float> %3190, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3192 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3160, <4 x float> %2937, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3193 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3161, <4 x float> %3192, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + %3194 = load <8 x half>, ptr addrspace(3) %2745, align 16 + %3195 = load <8 x half>, ptr addrspace(3) %2747, align 16 + %3196 = load <8 x half>, ptr addrspace(3) %2749, align 16 + %3197 = load <8 x half>, ptr addrspace(3) %2751, align 16 + %3198 = shufflevector <8 x half> %3196, <8 x half> poison, <4 x i32> + %3199 = shufflevector <8 x half> %3196, <8 x half> poison, <4 x i32> + %3200 = shufflevector <8 x half> %3197, <8 x half> poison, <4 x i32> + %3201 = shufflevector <8 x half> %3197, <8 x half> poison, <4 x i32> + %3202 = shufflevector <8 x half> %3194, <8 x half> poison, <4 x i32> + %3203 = shufflevector <8 x half> %3194, <8 x half> poison, <4 x i32> + %3204 = shufflevector <8 x half> %3195, <8 x half> poison, <4 x i32> + %3205 = shufflevector <8 x half> %3195, <8 x half> poison, <4 x i32> + %3206 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3198, <4 x float> %3031, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3207 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x 
half> %3203, <4 x half> %3199, <4 x float> %3206, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3208 = extractelement <4 x float> %3207, i64 0 + %3209 = extractelement <4 x float> %3207, i64 1 + %3210 = extractelement <4 x float> %3207, i64 2 + %3211 = extractelement <4 x float> %3207, i64 3 + %3212 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3198, <4 x float> %3033, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3213 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3199, <4 x float> %3212, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3214 = extractelement <4 x float> %3213, i64 0 + %3215 = extractelement <4 x float> %3213, i64 1 + %3216 = extractelement <4 x float> %3213, i64 2 + %3217 = extractelement <4 x float> %3213, i64 3 + %3218 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3200, <4 x float> %3035, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3219 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3201, <4 x float> %3218, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3220 = extractelement <4 x float> %3219, i64 0 + %3221 = extractelement <4 x float> %3219, i64 1 + %3222 = extractelement <4 x float> %3219, i64 2 + %3223 = extractelement <4 x float> %3219, i64 3 + %3224 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3200, <4 x float> %3037, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3225 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3201, <4 x float> %3224, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3226 = extractelement <4 x float> %3225, i64 0 + %3227 = extractelement <4 x float> %3225, i64 
1 + %3228 = extractelement <4 x float> %3225, i64 2 + %3229 = extractelement <4 x float> %3225, i64 3 + %3230 = load <8 x half>, ptr addrspace(3) %2770, align 16 + %3231 = load <8 x half>, ptr addrspace(3) %2772, align 16 + %3232 = shufflevector <8 x half> %3230, <8 x half> poison, <4 x i32> + %3233 = shufflevector <8 x half> %3230, <8 x half> poison, <4 x i32> + %3234 = shufflevector <8 x half> %3231, <8 x half> poison, <4 x i32> + %3235 = shufflevector <8 x half> %3231, <8 x half> poison, <4 x i32> + %3236 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3198, <4 x float> %3045, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3237 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3199, <4 x float> %3236, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3238 = extractelement <4 x float> %3237, i64 0 + %3239 = extractelement <4 x float> %3237, i64 1 + %3240 = extractelement <4 x float> %3237, i64 2 + %3241 = extractelement <4 x float> %3237, i64 3 + %3242 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3198, <4 x float> %3047, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3243 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3199, <4 x float> %3242, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3244 = extractelement <4 x float> %3243, i64 0 + %3245 = extractelement <4 x float> %3243, i64 1 + %3246 = extractelement <4 x float> %3243, i64 2 + %3247 = extractelement <4 x float> %3243, i64 3 + %3248 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3200, <4 x float> %3049, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3249 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3201, <4 x float> 
%3248, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3250 = extractelement <4 x float> %3249, i64 0 + %3251 = extractelement <4 x float> %3249, i64 1 + %3252 = extractelement <4 x float> %3249, i64 2 + %3253 = extractelement <4 x float> %3249, i64 3 + %3254 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3200, <4 x float> %3051, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3255 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3201, <4 x float> %3254, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3256 = extractelement <4 x float> %3255, i64 0 + %3257 = extractelement <4 x float> %3255, i64 1 + %3258 = extractelement <4 x float> %3255, i64 2 + %3259 = extractelement <4 x float> %3255, i64 3 + %3260 = load <8 x half>, ptr addrspace(3) %2786, align 16 + %3261 = load <8 x half>, ptr addrspace(3) %2788, align 16 + %3262 = shufflevector <8 x half> %3260, <8 x half> poison, <4 x i32> + %3263 = shufflevector <8 x half> %3260, <8 x half> poison, <4 x i32> + %3264 = shufflevector <8 x half> %3261, <8 x half> poison, <4 x i32> + %3265 = shufflevector <8 x half> %3261, <8 x half> poison, <4 x i32> + %3266 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3262, <4 x float> %3087, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3267 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3263, <4 x float> %3266, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3268 = extractelement <4 x float> %3267, i64 0 + %3269 = extractelement <4 x float> %3267, i64 1 + %3270 = extractelement <4 x float> %3267, i64 2 + %3271 = extractelement <4 x float> %3267, i64 3 + %3272 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3262, <4 x float> %3089, i32 0, 
i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3273 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3263, <4 x float> %3272, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3274 = extractelement <4 x float> %3273, i64 0 + %3275 = extractelement <4 x float> %3273, i64 1 + %3276 = extractelement <4 x float> %3273, i64 2 + %3277 = extractelement <4 x float> %3273, i64 3 + %3278 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3264, <4 x float> %3091, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3279 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3265, <4 x float> %3278, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3280 = extractelement <4 x float> %3279, i64 0 + %3281 = extractelement <4 x float> %3279, i64 1 + %3282 = extractelement <4 x float> %3279, i64 2 + %3283 = extractelement <4 x float> %3279, i64 3 + %3284 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3264, <4 x float> %3093, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3285 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3265, <4 x float> %3284, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3286 = extractelement <4 x float> %3285, i64 0 + %3287 = extractelement <4 x float> %3285, i64 1 + %3288 = extractelement <4 x float> %3285, i64 2 + %3289 = extractelement <4 x float> %3285, i64 3 + %3290 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3262, <4 x float> %3095, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3291 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3263, <4 x float> %3290, i32 0, i32 0, i32 0) + tail call 
void @llvm.amdgcn.sched.barrier(i32 2038) + %3292 = extractelement <4 x float> %3291, i64 0 + %3293 = extractelement <4 x float> %3291, i64 1 + %3294 = extractelement <4 x float> %3291, i64 2 + %3295 = extractelement <4 x float> %3291, i64 3 + %3296 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3262, <4 x float> %3097, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3297 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3263, <4 x float> %3296, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3298 = extractelement <4 x float> %3297, i64 0 + %3299 = extractelement <4 x float> %3297, i64 1 + %3300 = extractelement <4 x float> %3297, i64 2 + %3301 = extractelement <4 x float> %3297, i64 3 + %3302 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3264, <4 x float> %3099, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3303 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3265, <4 x float> %3302, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3304 = extractelement <4 x float> %3303, i64 0 + %3305 = extractelement <4 x float> %3303, i64 1 + %3306 = extractelement <4 x float> %3303, i64 2 + %3307 = extractelement <4 x float> %3303, i64 3 + %3308 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3264, <4 x float> %3101, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3309 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3265, <4 x float> %3308, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3310 = extractelement <4 x float> %3309, i64 0 + %3311 = extractelement <4 x float> %3309, i64 1 + %3312 = extractelement <4 x float> %3309, i64 2 + %3313 = extractelement <4 x 
float> %3309, i64 3 + %3314 = load <8 x half>, ptr addrspace(3) %2810, align 16 + %3315 = load <8 x half>, ptr addrspace(3) %2812, align 16 + %3316 = shufflevector <8 x half> %3314, <8 x half> poison, <4 x i32> + %3317 = shufflevector <8 x half> %3314, <8 x half> poison, <4 x i32> + %3318 = shufflevector <8 x half> %3315, <8 x half> poison, <4 x i32> + %3319 = shufflevector <8 x half> %3315, <8 x half> poison, <4 x i32> + %3320 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3198, <4 x float> %3059, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3321 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3199, <4 x float> %3320, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3322 = extractelement <4 x float> %3321, i64 0 + %3323 = extractelement <4 x float> %3321, i64 1 + %3324 = extractelement <4 x float> %3321, i64 2 + %3325 = extractelement <4 x float> %3321, i64 3 + %3326 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3198, <4 x float> %3061, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3327 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3199, <4 x float> %3326, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3328 = extractelement <4 x float> %3327, i64 0 + %3329 = extractelement <4 x float> %3327, i64 1 + %3330 = extractelement <4 x float> %3327, i64 2 + %3331 = extractelement <4 x float> %3327, i64 3 + %3332 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3200, <4 x float> %3063, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3333 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3201, <4 x float> %3332, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) 
+ %3334 = extractelement <4 x float> %3333, i64 0 + %3335 = extractelement <4 x float> %3333, i64 1 + %3336 = extractelement <4 x float> %3333, i64 2 + %3337 = extractelement <4 x float> %3333, i64 3 + %3338 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3200, <4 x float> %3065, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3339 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3201, <4 x float> %3338, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3340 = extractelement <4 x float> %3339, i64 0 + %3341 = extractelement <4 x float> %3339, i64 1 + %3342 = extractelement <4 x float> %3339, i64 2 + %3343 = extractelement <4 x float> %3339, i64 3 + %3344 = load <8 x half>, ptr addrspace(3) %2826, align 16 + %3345 = load <8 x half>, ptr addrspace(3) %2828, align 16 + %3346 = shufflevector <8 x half> %3344, <8 x half> poison, <4 x i32> + %3347 = shufflevector <8 x half> %3344, <8 x half> poison, <4 x i32> + %3348 = shufflevector <8 x half> %3345, <8 x half> poison, <4 x i32> + %3349 = shufflevector <8 x half> %3345, <8 x half> poison, <4 x i32> + %3350 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3198, <4 x float> %3073, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3351 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3199, <4 x float> %3350, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3352 = extractelement <4 x float> %3351, i64 0 + %3353 = extractelement <4 x float> %3351, i64 1 + %3354 = extractelement <4 x float> %3351, i64 2 + %3355 = extractelement <4 x float> %3351, i64 3 + %3356 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3198, <4 x float> %3075, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3357 = tail 
call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3199, <4 x float> %3356, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3358 = extractelement <4 x float> %3357, i64 0 + %3359 = extractelement <4 x float> %3357, i64 1 + %3360 = extractelement <4 x float> %3357, i64 2 + %3361 = extractelement <4 x float> %3357, i64 3 + %3362 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3200, <4 x float> %3077, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3363 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3201, <4 x float> %3362, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3364 = extractelement <4 x float> %3363, i64 0 + %3365 = extractelement <4 x float> %3363, i64 1 + %3366 = extractelement <4 x float> %3363, i64 2 + %3367 = extractelement <4 x float> %3363, i64 3 + %3368 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3200, <4 x float> %3079, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3369 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3201, <4 x float> %3368, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3370 = extractelement <4 x float> %3369, i64 0 + %3371 = extractelement <4 x float> %3369, i64 1 + %3372 = extractelement <4 x float> %3369, i64 2 + %3373 = extractelement <4 x float> %3369, i64 3 + %3374 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3262, <4 x float> %3103, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3375 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3263, <4 x float> %3374, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3376 = extractelement <4 x float> 
%3375, i64 0 + %3377 = extractelement <4 x float> %3375, i64 1 + %3378 = extractelement <4 x float> %3375, i64 2 + %3379 = extractelement <4 x float> %3375, i64 3 + %3380 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3262, <4 x float> %3105, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3381 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3263, <4 x float> %3380, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3382 = extractelement <4 x float> %3381, i64 0 + %3383 = extractelement <4 x float> %3381, i64 1 + %3384 = extractelement <4 x float> %3381, i64 2 + %3385 = extractelement <4 x float> %3381, i64 3 + %3386 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3264, <4 x float> %3107, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3387 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3265, <4 x float> %3386, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3388 = extractelement <4 x float> %3387, i64 0 + %3389 = extractelement <4 x float> %3387, i64 1 + %3390 = extractelement <4 x float> %3387, i64 2 + %3391 = extractelement <4 x float> %3387, i64 3 + %3392 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3264, <4 x float> %3109, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3393 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3265, <4 x float> %3392, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3394 = extractelement <4 x float> %3393, i64 0 + %3395 = extractelement <4 x float> %3393, i64 1 + %3396 = extractelement <4 x float> %3393, i64 2 + %3397 = extractelement <4 x float> %3393, i64 3 + %3398 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3262, <4 x float> %3111, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3399 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3263, <4 x float> %3398, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3400 = extractelement <4 x float> %3399, i64 0 + %3401 = extractelement <4 x float> %3399, i64 1 + %3402 = extractelement <4 x float> %3399, i64 2 + %3403 = extractelement <4 x float> %3399, i64 3 + %3404 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3262, <4 x float> %3113, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3405 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3263, <4 x float> %3404, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3406 = extractelement <4 x float> %3405, i64 0 + %3407 = extractelement <4 x float> %3405, i64 1 + %3408 = extractelement <4 x float> %3405, i64 2 + %3409 = extractelement <4 x float> %3405, i64 3 + %3410 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3264, <4 x float> %3115, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3411 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3265, <4 x float> %3410, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3412 = extractelement <4 x float> %3411, i64 0 + %3413 = extractelement <4 x float> %3411, i64 1 + %3414 = extractelement <4 x float> %3411, i64 2 + %3415 = extractelement <4 x float> %3411, i64 3 + %3416 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3264, <4 x float> %3117, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3417 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3265, <4 x float> %3416, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3418 = extractelement <4 x float> %3417, i64 0 + %3419 = extractelement <4 x float> %3417, i64 1 + %3420 = extractelement <4 x float> %3417, i64 2 + %3421 = extractelement <4 x float> %3417, i64 3 + %3422 = load <8 x half>, ptr addrspace(3) %2858, align 16 + %3423 = load <8 x half>, ptr addrspace(3) %2860, align 16 + %3424 = shufflevector <8 x half> %3422, <8 x half> poison, <4 x i32> + %3425 = shufflevector <8 x half> %3422, <8 x half> poison, <4 x i32> + %3426 = shufflevector <8 x half> %3423, <8 x half> poison, <4 x i32> + %3427 = shufflevector <8 x half> %3423, <8 x half> poison, <4 x i32> + %3428 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3424, <4 x float> %3125, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3429 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3425, <4 x float> %3428, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3430 = extractelement <4 x float> %3429, i64 0 + %3431 = extractelement <4 x float> %3429, i64 1 + %3432 = extractelement <4 x float> %3429, i64 2 + %3433 = extractelement <4 x float> %3429, i64 3 + %3434 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3424, <4 x float> %3127, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3435 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3425, <4 x float> %3434, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3436 = extractelement <4 x float> %3435, i64 0 + %3437 = extractelement <4 x float> %3435, i64 1 + %3438 = extractelement <4 x float> %3435, i64 2 + %3439 = extractelement <4 x float> %3435, i64 3 + %3440 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3426, <4 x float> %3129, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3441 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3427, <4 x float> %3440, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3442 = extractelement <4 x float> %3441, i64 0 + %3443 = extractelement <4 x float> %3441, i64 1 + %3444 = extractelement <4 x float> %3441, i64 2 + %3445 = extractelement <4 x float> %3441, i64 3 + %3446 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3426, <4 x float> %3131, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3447 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3427, <4 x float> %3446, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3448 = extractelement <4 x float> %3447, i64 0 + %3449 = extractelement <4 x float> %3447, i64 1 + %3450 = extractelement <4 x float> %3447, i64 2 + %3451 = extractelement <4 x float> %3447, i64 3 + %3452 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3424, <4 x float> %3133, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3453 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3425, <4 x float> %3452, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3454 = extractelement <4 x float> %3453, i64 0 + %3455 = extractelement <4 x float> %3453, i64 1 + %3456 = extractelement <4 x float> %3453, i64 2 + %3457 = extractelement <4 x float> %3453, i64 3 + %3458 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3424, <4 x float> %3135, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3459 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3425, <4 x float> %3458, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3460 = extractelement <4 x float> %3459, i64 0 + %3461 = extractelement <4 x float> %3459, i64 1 + %3462 = extractelement <4 x float> %3459, i64 2 + %3463 = extractelement <4 x float> %3459, i64 3 + %3464 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3426, <4 x float> %3137, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3465 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3427, <4 x float> %3464, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3466 = extractelement <4 x float> %3465, i64 0 + %3467 = extractelement <4 x float> %3465, i64 1 + %3468 = extractelement <4 x float> %3465, i64 2 + %3469 = extractelement <4 x float> %3465, i64 3 + %3470 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3426, <4 x float> %3139, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3471 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3427, <4 x float> %3470, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3472 = extractelement <4 x float> %3471, i64 0 + %3473 = extractelement <4 x float> %3471, i64 1 + %3474 = extractelement <4 x float> %3471, i64 2 + %3475 = extractelement <4 x float> %3471, i64 3 + %3476 = load <8 x half>, ptr addrspace(3) %2882, align 16 + %3477 = load <8 x half>, ptr addrspace(3) %2884, align 16 + %3478 = shufflevector <8 x half> %3476, <8 x half> poison, <4 x i32> + %3479 = shufflevector <8 x half> %3476, <8 x half> poison, <4 x i32> + %3480 = shufflevector <8 x half> %3477, <8 x half> poison, <4 x i32> + %3481 = shufflevector <8 x half> %3477, <8 x half> poison, <4 x i32> + %3482 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3478, <4 x float> %3163, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3483 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3479, <4 x float> %3482, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3484 = extractelement <4 x float> %3483, i64 0 + %3485 = extractelement <4 x float> %3483, i64 1 + %3486 = extractelement <4 x float> %3483, i64 2 + %3487 = extractelement <4 x float> %3483, i64 3 + %3488 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3478, <4 x float> %3165, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3489 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3479, <4 x float> %3488, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3490 = extractelement <4 x float> %3489, i64 0 + %3491 = extractelement <4 x float> %3489, i64 1 + %3492 = extractelement <4 x float> %3489, i64 2 + %3493 = extractelement <4 x float> %3489, i64 3 + %3494 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3480, <4 x float> %3167, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3495 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3481, <4 x float> %3494, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3496 = extractelement <4 x float> %3495, i64 0 + %3497 = extractelement <4 x float> %3495, i64 1 + %3498 = extractelement <4 x float> %3495, i64 2 + %3499 = extractelement <4 x float> %3495, i64 3 + %3500 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3480, <4 x float> %3169, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3501 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3481, <4 x float> %3500, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3502 = extractelement <4 x float> %3501, i64 0 + %3503 = extractelement <4 x float> %3501, i64 1 + %3504 = extractelement <4 x float> %3501, i64 2 + %3505 = extractelement <4 x float> %3501, i64 3 + %3506 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3478, <4 x float> %3171, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3507 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3479, <4 x float> %3506, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3508 = extractelement <4 x float> %3507, i64 0 + %3509 = extractelement <4 x float> %3507, i64 1 + %3510 = extractelement <4 x float> %3507, i64 2 + %3511 = extractelement <4 x float> %3507, i64 3 + %3512 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3478, <4 x float> %3173, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3513 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3479, <4 x float> %3512, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3514 = extractelement <4 x float> %3513, i64 0 + %3515 = extractelement <4 x float> %3513, i64 1 + %3516 = extractelement <4 x float> %3513, i64 2 + %3517 = extractelement <4 x float> %3513, i64 3 + %3518 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3480, <4 x float> %3175, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3519 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3481, <4 x float> %3518, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3520 = extractelement <4 x float> %3519, i64 0 + %3521 = 
extractelement <4 x float> %3519, i64 1 + %3522 = extractelement <4 x float> %3519, i64 2 + %3523 = extractelement <4 x float> %3519, i64 3 + %3524 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3480, <4 x float> %3177, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3525 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3481, <4 x float> %3524, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3526 = extractelement <4 x float> %3525, i64 0 + %3527 = extractelement <4 x float> %3525, i64 1 + %3528 = extractelement <4 x float> %3525, i64 2 + %3529 = extractelement <4 x float> %3525, i64 3 + %3530 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3424, <4 x float> %3141, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3531 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3425, <4 x float> %3530, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3532 = extractelement <4 x float> %3531, i64 0 + %3533 = extractelement <4 x float> %3531, i64 1 + %3534 = extractelement <4 x float> %3531, i64 2 + %3535 = extractelement <4 x float> %3531, i64 3 + %3536 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3424, <4 x float> %3143, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3537 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3425, <4 x float> %3536, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3538 = extractelement <4 x float> %3537, i64 0 + %3539 = extractelement <4 x float> %3537, i64 1 + %3540 = extractelement <4 x float> %3537, i64 2 + %3541 = extractelement <4 x float> %3537, i64 3 + %3542 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, 
<4 x half> %3426, <4 x float> %3145, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3543 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3427, <4 x float> %3542, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3544 = extractelement <4 x float> %3543, i64 0 + %3545 = extractelement <4 x float> %3543, i64 1 + %3546 = extractelement <4 x float> %3543, i64 2 + %3547 = extractelement <4 x float> %3543, i64 3 + %3548 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3426, <4 x float> %3147, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3549 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3427, <4 x float> %3548, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3550 = extractelement <4 x float> %3549, i64 0 + %3551 = extractelement <4 x float> %3549, i64 1 + %3552 = extractelement <4 x float> %3549, i64 2 + %3553 = extractelement <4 x float> %3549, i64 3 + %3554 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3424, <4 x float> %3149, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3555 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3425, <4 x float> %3554, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3556 = extractelement <4 x float> %3555, i64 0 + %3557 = extractelement <4 x float> %3555, i64 1 + %3558 = extractelement <4 x float> %3555, i64 2 + %3559 = extractelement <4 x float> %3555, i64 3 + %3560 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3424, <4 x float> %3151, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3561 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3425, <4 x 
float> %3560, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3562 = extractelement <4 x float> %3561, i64 0 + %3563 = extractelement <4 x float> %3561, i64 1 + %3564 = extractelement <4 x float> %3561, i64 2 + %3565 = extractelement <4 x float> %3561, i64 3 + %3566 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3426, <4 x float> %3153, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3567 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3427, <4 x float> %3566, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3568 = extractelement <4 x float> %3567, i64 0 + %3569 = extractelement <4 x float> %3567, i64 1 + %3570 = extractelement <4 x float> %3567, i64 2 + %3571 = extractelement <4 x float> %3567, i64 3 + %3572 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3426, <4 x float> %3155, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3573 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3427, <4 x float> %3572, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3574 = extractelement <4 x float> %3573, i64 0 + %3575 = extractelement <4 x float> %3573, i64 1 + %3576 = extractelement <4 x float> %3573, i64 2 + %3577 = extractelement <4 x float> %3573, i64 3 + %3578 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3478, <4 x float> %3179, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3579 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3479, <4 x float> %3578, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3580 = extractelement <4 x float> %3579, i64 0 + %3581 = extractelement <4 x float> %3579, i64 1 + %3582 = extractelement <4 x 
float> %3579, i64 2 + %3583 = extractelement <4 x float> %3579, i64 3 + %3584 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3478, <4 x float> %3181, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3585 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3479, <4 x float> %3584, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3586 = extractelement <4 x float> %3585, i64 0 + %3587 = extractelement <4 x float> %3585, i64 1 + %3588 = extractelement <4 x float> %3585, i64 2 + %3589 = extractelement <4 x float> %3585, i64 3 + %3590 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3480, <4 x float> %3183, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3591 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3481, <4 x float> %3590, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3592 = extractelement <4 x float> %3591, i64 0 + %3593 = extractelement <4 x float> %3591, i64 1 + %3594 = extractelement <4 x float> %3591, i64 2 + %3595 = extractelement <4 x float> %3591, i64 3 + %3596 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3480, <4 x float> %3185, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3597 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3481, <4 x float> %3596, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3598 = extractelement <4 x float> %3597, i64 0 + %3599 = extractelement <4 x float> %3597, i64 1 + %3600 = extractelement <4 x float> %3597, i64 2 + %3601 = extractelement <4 x float> %3597, i64 3 + %3602 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3478, <4 x float> %3187, i32 0, i32 0, i32 0) + tail call 
void @llvm.amdgcn.sched.barrier(i32 2038) + %3603 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3479, <4 x float> %3602, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3604 = extractelement <4 x float> %3603, i64 0 + %3605 = extractelement <4 x float> %3603, i64 1 + %3606 = extractelement <4 x float> %3603, i64 2 + %3607 = extractelement <4 x float> %3603, i64 3 + %3608 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3478, <4 x float> %3189, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3609 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3479, <4 x float> %3608, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3610 = extractelement <4 x float> %3609, i64 0 + %3611 = extractelement <4 x float> %3609, i64 1 + %3612 = extractelement <4 x float> %3609, i64 2 + %3613 = extractelement <4 x float> %3609, i64 3 + %3614 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3480, <4 x float> %3191, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3615 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3481, <4 x float> %3614, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3616 = extractelement <4 x float> %3615, i64 0 + %3617 = extractelement <4 x float> %3615, i64 1 + %3618 = extractelement <4 x float> %3615, i64 2 + %3619 = extractelement <4 x float> %3615, i64 3 + %3620 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3480, <4 x float> %3193, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 2038) + %3621 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3481, <4 x float> %3620, i32 0, i32 0, i32 0) + tail call void 
@llvm.amdgcn.sched.barrier(i32 2038) + tail call void @llvm.amdgcn.sched.barrier(i32 1030) + %3622 = extractelement <4 x float> %3621, i64 0 + %3623 = extractelement <4 x float> %3621, i64 1 + %3624 = extractelement <4 x float> %3621, i64 2 + %3625 = extractelement <4 x float> %3621, i64 3 + %3626 = mul i32 %31, %9 + %3627 = sext i32 %3626 to i64 + %3628 = getelementptr half, ptr addrspace(1) %2, i64 %3627 + %3629 = sext i32 %118 to i64 + %3630 = getelementptr half, ptr addrspace(1) %3628, i64 %3629 + %3631 = mul i32 %9, %225 + %3632 = mul i32 %9, %2309 + %3633 = mul i32 %9, %2308 + %3634 = mul i32 %9, %2307 + %3635 = mul i32 %9, %2306 + %3636 = mul i32 %9, %2305 + %3637 = mul i32 %9, %2304 + %3638 = mul i32 %9, %2303 + tail call void @llvm.amdgcn.sched.barrier(i32 1030) + %3639 = add i32 %3631, %2295 + %3640 = add i32 %3631, %2302 + %3641 = add i32 %3632, %2295 + %3642 = add i32 %3632, %2302 + %3643 = fptrunc float %3208 to half + %3644 = fptrunc float %3209 to half + %3645 = fptrunc float %3210 to half + %3646 = fptrunc float %3211 to half + %3647 = fptrunc float %3214 to half + %3648 = fptrunc float %3215 to half + %3649 = fptrunc float %3216 to half + %3650 = fptrunc float %3217 to half + %3651 = fptrunc float %3220 to half + %3652 = fptrunc float %3221 to half + %3653 = fptrunc float %3222 to half + %3654 = fptrunc float %3223 to half + %3655 = fptrunc float %3226 to half + %3656 = fptrunc float %3227 to half + %3657 = fptrunc float %3228 to half + %3658 = fptrunc float %3229 to half + %3659 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %3630, i16 0, i32 2147483646, i32 159744) + %3660 = insertelement <4 x half> poison, half %3643, i64 0 + %3661 = insertelement <4 x half> %3660, half %3644, i64 1 + %3662 = insertelement <4 x half> %3661, half %3645, i64 2 + %3663 = insertelement <4 x half> %3662, half %3646, i64 3 + %3664 = bitcast <4 x half> %3663 to <2 x i32> + %3665 = shl i32 %3639, 1 + tail call void 
@llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3664, ptr addrspace(8) %3659, i32 %3665, i32 0, i32 0) + %3666 = insertelement <4 x half> poison, half %3647, i64 0 + %3667 = insertelement <4 x half> %3666, half %3648, i64 1 + %3668 = insertelement <4 x half> %3667, half %3649, i64 2 + %3669 = insertelement <4 x half> %3668, half %3650, i64 3 + %3670 = bitcast <4 x half> %3669 to <2 x i32> + %3671 = shl i32 %3640, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3670, ptr addrspace(8) %3659, i32 %3671, i32 0, i32 0) + %3672 = insertelement <4 x half> poison, half %3651, i64 0 + %3673 = insertelement <4 x half> %3672, half %3652, i64 1 + %3674 = insertelement <4 x half> %3673, half %3653, i64 2 + %3675 = insertelement <4 x half> %3674, half %3654, i64 3 + %3676 = bitcast <4 x half> %3675 to <2 x i32> + %3677 = shl i32 %3641, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3676, ptr addrspace(8) %3659, i32 %3677, i32 0, i32 0) + %3678 = insertelement <4 x half> poison, half %3655, i64 0 + %3679 = insertelement <4 x half> %3678, half %3656, i64 1 + %3680 = insertelement <4 x half> %3679, half %3657, i64 2 + %3681 = insertelement <4 x half> %3680, half %3658, i64 3 + %3682 = bitcast <4 x half> %3681 to <2 x i32> + %3683 = shl i32 %3642, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3682, ptr addrspace(8) %3659, i32 %3683, i32 0, i32 0) + %3684 = add i32 %3631, %2301 + %3685 = add i32 %3631, %2300 + %3686 = add i32 %3632, %2301 + %3687 = add i32 %3632, %2300 + %3688 = fptrunc float %3238 to half + %3689 = fptrunc float %3239 to half + %3690 = fptrunc float %3240 to half + %3691 = fptrunc float %3241 to half + %3692 = fptrunc float %3244 to half + %3693 = fptrunc float %3245 to half + %3694 = fptrunc float %3246 to half + %3695 = fptrunc float %3247 to half + %3696 = fptrunc float %3250 to half + %3697 = fptrunc float %3251 to half + %3698 = fptrunc float %3252 to half + %3699 = fptrunc float %3253 to 
half + %3700 = fptrunc float %3256 to half + %3701 = fptrunc float %3257 to half + %3702 = fptrunc float %3258 to half + %3703 = fptrunc float %3259 to half + %3704 = insertelement <4 x half> poison, half %3688, i64 0 + %3705 = insertelement <4 x half> %3704, half %3689, i64 1 + %3706 = insertelement <4 x half> %3705, half %3690, i64 2 + %3707 = insertelement <4 x half> %3706, half %3691, i64 3 + %3708 = bitcast <4 x half> %3707 to <2 x i32> + %3709 = shl i32 %3684, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3708, ptr addrspace(8) %3659, i32 %3709, i32 0, i32 0) + %3710 = insertelement <4 x half> poison, half %3692, i64 0 + %3711 = insertelement <4 x half> %3710, half %3693, i64 1 + %3712 = insertelement <4 x half> %3711, half %3694, i64 2 + %3713 = insertelement <4 x half> %3712, half %3695, i64 3 + %3714 = bitcast <4 x half> %3713 to <2 x i32> + %3715 = shl i32 %3685, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3714, ptr addrspace(8) %3659, i32 %3715, i32 0, i32 0) + %3716 = insertelement <4 x half> poison, half %3696, i64 0 + %3717 = insertelement <4 x half> %3716, half %3697, i64 1 + %3718 = insertelement <4 x half> %3717, half %3698, i64 2 + %3719 = insertelement <4 x half> %3718, half %3699, i64 3 + %3720 = bitcast <4 x half> %3719 to <2 x i32> + %3721 = shl i32 %3686, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3720, ptr addrspace(8) %3659, i32 %3721, i32 0, i32 0) + %3722 = insertelement <4 x half> poison, half %3700, i64 0 + %3723 = insertelement <4 x half> %3722, half %3701, i64 1 + %3724 = insertelement <4 x half> %3723, half %3702, i64 2 + %3725 = insertelement <4 x half> %3724, half %3703, i64 3 + %3726 = bitcast <4 x half> %3725 to <2 x i32> + %3727 = shl i32 %3687, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3726, ptr addrspace(8) %3659, i32 %3727, i32 0, i32 0) + %3728 = add i32 %3631, %2299 + %3729 = add i32 %3631, %2298 + %3730 = add i32 %3632, 
%2299 + %3731 = add i32 %3632, %2298 + %3732 = fptrunc float %3322 to half + %3733 = fptrunc float %3323 to half + %3734 = fptrunc float %3324 to half + %3735 = fptrunc float %3325 to half + %3736 = fptrunc float %3328 to half + %3737 = fptrunc float %3329 to half + %3738 = fptrunc float %3330 to half + %3739 = fptrunc float %3331 to half + %3740 = fptrunc float %3334 to half + %3741 = fptrunc float %3335 to half + %3742 = fptrunc float %3336 to half + %3743 = fptrunc float %3337 to half + %3744 = fptrunc float %3340 to half + %3745 = fptrunc float %3341 to half + %3746 = fptrunc float %3342 to half + %3747 = fptrunc float %3343 to half + %3748 = insertelement <4 x half> poison, half %3732, i64 0 + %3749 = insertelement <4 x half> %3748, half %3733, i64 1 + %3750 = insertelement <4 x half> %3749, half %3734, i64 2 + %3751 = insertelement <4 x half> %3750, half %3735, i64 3 + %3752 = bitcast <4 x half> %3751 to <2 x i32> + %3753 = shl i32 %3728, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3752, ptr addrspace(8) %3659, i32 %3753, i32 0, i32 0) + %3754 = insertelement <4 x half> poison, half %3736, i64 0 + %3755 = insertelement <4 x half> %3754, half %3737, i64 1 + %3756 = insertelement <4 x half> %3755, half %3738, i64 2 + %3757 = insertelement <4 x half> %3756, half %3739, i64 3 + %3758 = bitcast <4 x half> %3757 to <2 x i32> + %3759 = shl i32 %3729, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3758, ptr addrspace(8) %3659, i32 %3759, i32 0, i32 0) + %3760 = insertelement <4 x half> poison, half %3740, i64 0 + %3761 = insertelement <4 x half> %3760, half %3741, i64 1 + %3762 = insertelement <4 x half> %3761, half %3742, i64 2 + %3763 = insertelement <4 x half> %3762, half %3743, i64 3 + %3764 = bitcast <4 x half> %3763 to <2 x i32> + %3765 = shl i32 %3730, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3764, ptr addrspace(8) %3659, i32 %3765, i32 0, i32 0) + %3766 = insertelement <4 x half> 
poison, half %3744, i64 0 + %3767 = insertelement <4 x half> %3766, half %3745, i64 1 + %3768 = insertelement <4 x half> %3767, half %3746, i64 2 + %3769 = insertelement <4 x half> %3768, half %3747, i64 3 + %3770 = bitcast <4 x half> %3769 to <2 x i32> + %3771 = shl i32 %3731, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3770, ptr addrspace(8) %3659, i32 %3771, i32 0, i32 0) + %3772 = add i32 %3631, %2297 + %3773 = add i32 %3631, %2296 + %3774 = add i32 %3632, %2297 + %3775 = add i32 %3632, %2296 + %3776 = fptrunc float %3352 to half + %3777 = fptrunc float %3353 to half + %3778 = fptrunc float %3354 to half + %3779 = fptrunc float %3355 to half + %3780 = fptrunc float %3358 to half + %3781 = fptrunc float %3359 to half + %3782 = fptrunc float %3360 to half + %3783 = fptrunc float %3361 to half + %3784 = fptrunc float %3364 to half + %3785 = fptrunc float %3365 to half + %3786 = fptrunc float %3366 to half + %3787 = fptrunc float %3367 to half + %3788 = fptrunc float %3370 to half + %3789 = fptrunc float %3371 to half + %3790 = fptrunc float %3372 to half + %3791 = fptrunc float %3373 to half + %3792 = insertelement <4 x half> poison, half %3776, i64 0 + %3793 = insertelement <4 x half> %3792, half %3777, i64 1 + %3794 = insertelement <4 x half> %3793, half %3778, i64 2 + %3795 = insertelement <4 x half> %3794, half %3779, i64 3 + %3796 = bitcast <4 x half> %3795 to <2 x i32> + %3797 = shl i32 %3772, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3796, ptr addrspace(8) %3659, i32 %3797, i32 0, i32 0) + %3798 = insertelement <4 x half> poison, half %3780, i64 0 + %3799 = insertelement <4 x half> %3798, half %3781, i64 1 + %3800 = insertelement <4 x half> %3799, half %3782, i64 2 + %3801 = insertelement <4 x half> %3800, half %3783, i64 3 + %3802 = bitcast <4 x half> %3801 to <2 x i32> + %3803 = shl i32 %3773, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3802, ptr addrspace(8) %3659, i32 %3803, 
i32 0, i32 0) + %3804 = insertelement <4 x half> poison, half %3784, i64 0 + %3805 = insertelement <4 x half> %3804, half %3785, i64 1 + %3806 = insertelement <4 x half> %3805, half %3786, i64 2 + %3807 = insertelement <4 x half> %3806, half %3787, i64 3 + %3808 = bitcast <4 x half> %3807 to <2 x i32> + %3809 = shl i32 %3774, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3808, ptr addrspace(8) %3659, i32 %3809, i32 0, i32 0) + %3810 = insertelement <4 x half> poison, half %3788, i64 0 + %3811 = insertelement <4 x half> %3810, half %3789, i64 1 + %3812 = insertelement <4 x half> %3811, half %3790, i64 2 + %3813 = insertelement <4 x half> %3812, half %3791, i64 3 + %3814 = bitcast <4 x half> %3813 to <2 x i32> + %3815 = shl i32 %3775, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3814, ptr addrspace(8) %3659, i32 %3815, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 1030) + %3816 = add i32 %3633, %2295 + %3817 = add i32 %3633, %2302 + %3818 = add i32 %3634, %2295 + %3819 = add i32 %3634, %2302 + %3820 = fptrunc float %3268 to half + %3821 = fptrunc float %3269 to half + %3822 = fptrunc float %3270 to half + %3823 = fptrunc float %3271 to half + %3824 = fptrunc float %3274 to half + %3825 = fptrunc float %3275 to half + %3826 = fptrunc float %3276 to half + %3827 = fptrunc float %3277 to half + %3828 = fptrunc float %3280 to half + %3829 = fptrunc float %3281 to half + %3830 = fptrunc float %3282 to half + %3831 = fptrunc float %3283 to half + %3832 = fptrunc float %3286 to half + %3833 = fptrunc float %3287 to half + %3834 = fptrunc float %3288 to half + %3835 = fptrunc float %3289 to half + %3836 = insertelement <4 x half> poison, half %3820, i64 0 + %3837 = insertelement <4 x half> %3836, half %3821, i64 1 + %3838 = insertelement <4 x half> %3837, half %3822, i64 2 + %3839 = insertelement <4 x half> %3838, half %3823, i64 3 + %3840 = bitcast <4 x half> %3839 to <2 x i32> + %3841 = shl i32 %3816, 1 + 
tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3840, ptr addrspace(8) %3659, i32 %3841, i32 0, i32 0) + %3842 = insertelement <4 x half> poison, half %3824, i64 0 + %3843 = insertelement <4 x half> %3842, half %3825, i64 1 + %3844 = insertelement <4 x half> %3843, half %3826, i64 2 + %3845 = insertelement <4 x half> %3844, half %3827, i64 3 + %3846 = bitcast <4 x half> %3845 to <2 x i32> + %3847 = shl i32 %3817, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3846, ptr addrspace(8) %3659, i32 %3847, i32 0, i32 0) + %3848 = insertelement <4 x half> poison, half %3828, i64 0 + %3849 = insertelement <4 x half> %3848, half %3829, i64 1 + %3850 = insertelement <4 x half> %3849, half %3830, i64 2 + %3851 = insertelement <4 x half> %3850, half %3831, i64 3 + %3852 = bitcast <4 x half> %3851 to <2 x i32> + %3853 = shl i32 %3818, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3852, ptr addrspace(8) %3659, i32 %3853, i32 0, i32 0) + %3854 = insertelement <4 x half> poison, half %3832, i64 0 + %3855 = insertelement <4 x half> %3854, half %3833, i64 1 + %3856 = insertelement <4 x half> %3855, half %3834, i64 2 + %3857 = insertelement <4 x half> %3856, half %3835, i64 3 + %3858 = bitcast <4 x half> %3857 to <2 x i32> + %3859 = shl i32 %3819, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3858, ptr addrspace(8) %3659, i32 %3859, i32 0, i32 0) + %3860 = add i32 %3633, %2301 + %3861 = add i32 %3633, %2300 + %3862 = add i32 %3634, %2301 + %3863 = add i32 %3634, %2300 + %3864 = fptrunc float %3292 to half + %3865 = fptrunc float %3293 to half + %3866 = fptrunc float %3294 to half + %3867 = fptrunc float %3295 to half + %3868 = fptrunc float %3298 to half + %3869 = fptrunc float %3299 to half + %3870 = fptrunc float %3300 to half + %3871 = fptrunc float %3301 to half + %3872 = fptrunc float %3304 to half + %3873 = fptrunc float %3305 to half + %3874 = fptrunc float %3306 to half + %3875 = fptrunc 
float %3307 to half + %3876 = fptrunc float %3310 to half + %3877 = fptrunc float %3311 to half + %3878 = fptrunc float %3312 to half + %3879 = fptrunc float %3313 to half + %3880 = insertelement <4 x half> poison, half %3864, i64 0 + %3881 = insertelement <4 x half> %3880, half %3865, i64 1 + %3882 = insertelement <4 x half> %3881, half %3866, i64 2 + %3883 = insertelement <4 x half> %3882, half %3867, i64 3 + %3884 = bitcast <4 x half> %3883 to <2 x i32> + %3885 = shl i32 %3860, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3884, ptr addrspace(8) %3659, i32 %3885, i32 0, i32 0) + %3886 = insertelement <4 x half> poison, half %3868, i64 0 + %3887 = insertelement <4 x half> %3886, half %3869, i64 1 + %3888 = insertelement <4 x half> %3887, half %3870, i64 2 + %3889 = insertelement <4 x half> %3888, half %3871, i64 3 + %3890 = bitcast <4 x half> %3889 to <2 x i32> + %3891 = shl i32 %3861, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3890, ptr addrspace(8) %3659, i32 %3891, i32 0, i32 0) + %3892 = insertelement <4 x half> poison, half %3872, i64 0 + %3893 = insertelement <4 x half> %3892, half %3873, i64 1 + %3894 = insertelement <4 x half> %3893, half %3874, i64 2 + %3895 = insertelement <4 x half> %3894, half %3875, i64 3 + %3896 = bitcast <4 x half> %3895 to <2 x i32> + %3897 = shl i32 %3862, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3896, ptr addrspace(8) %3659, i32 %3897, i32 0, i32 0) + %3898 = insertelement <4 x half> poison, half %3876, i64 0 + %3899 = insertelement <4 x half> %3898, half %3877, i64 1 + %3900 = insertelement <4 x half> %3899, half %3878, i64 2 + %3901 = insertelement <4 x half> %3900, half %3879, i64 3 + %3902 = bitcast <4 x half> %3901 to <2 x i32> + %3903 = shl i32 %3863, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3902, ptr addrspace(8) %3659, i32 %3903, i32 0, i32 0) + %3904 = add i32 %3633, %2299 + %3905 = add i32 %3633, %2298 + %3906 = 
add i32 %3634, %2299 + %3907 = add i32 %3634, %2298 + %3908 = fptrunc float %3376 to half + %3909 = fptrunc float %3377 to half + %3910 = fptrunc float %3378 to half + %3911 = fptrunc float %3379 to half + %3912 = fptrunc float %3382 to half + %3913 = fptrunc float %3383 to half + %3914 = fptrunc float %3384 to half + %3915 = fptrunc float %3385 to half + %3916 = fptrunc float %3388 to half + %3917 = fptrunc float %3389 to half + %3918 = fptrunc float %3390 to half + %3919 = fptrunc float %3391 to half + %3920 = fptrunc float %3394 to half + %3921 = fptrunc float %3395 to half + %3922 = fptrunc float %3396 to half + %3923 = fptrunc float %3397 to half + %3924 = insertelement <4 x half> poison, half %3908, i64 0 + %3925 = insertelement <4 x half> %3924, half %3909, i64 1 + %3926 = insertelement <4 x half> %3925, half %3910, i64 2 + %3927 = insertelement <4 x half> %3926, half %3911, i64 3 + %3928 = bitcast <4 x half> %3927 to <2 x i32> + %3929 = shl i32 %3904, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3928, ptr addrspace(8) %3659, i32 %3929, i32 0, i32 0) + %3930 = insertelement <4 x half> poison, half %3912, i64 0 + %3931 = insertelement <4 x half> %3930, half %3913, i64 1 + %3932 = insertelement <4 x half> %3931, half %3914, i64 2 + %3933 = insertelement <4 x half> %3932, half %3915, i64 3 + %3934 = bitcast <4 x half> %3933 to <2 x i32> + %3935 = shl i32 %3905, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3934, ptr addrspace(8) %3659, i32 %3935, i32 0, i32 0) + %3936 = insertelement <4 x half> poison, half %3916, i64 0 + %3937 = insertelement <4 x half> %3936, half %3917, i64 1 + %3938 = insertelement <4 x half> %3937, half %3918, i64 2 + %3939 = insertelement <4 x half> %3938, half %3919, i64 3 + %3940 = bitcast <4 x half> %3939 to <2 x i32> + %3941 = shl i32 %3906, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3940, ptr addrspace(8) %3659, i32 %3941, i32 0, i32 0) + %3942 = 
insertelement <4 x half> poison, half %3920, i64 0 + %3943 = insertelement <4 x half> %3942, half %3921, i64 1 + %3944 = insertelement <4 x half> %3943, half %3922, i64 2 + %3945 = insertelement <4 x half> %3944, half %3923, i64 3 + %3946 = bitcast <4 x half> %3945 to <2 x i32> + %3947 = shl i32 %3907, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3946, ptr addrspace(8) %3659, i32 %3947, i32 0, i32 0) + %3948 = add i32 %3633, %2297 + %3949 = add i32 %3633, %2296 + %3950 = add i32 %3634, %2297 + %3951 = add i32 %3634, %2296 + %3952 = fptrunc float %3400 to half + %3953 = fptrunc float %3401 to half + %3954 = fptrunc float %3402 to half + %3955 = fptrunc float %3403 to half + %3956 = fptrunc float %3406 to half + %3957 = fptrunc float %3407 to half + %3958 = fptrunc float %3408 to half + %3959 = fptrunc float %3409 to half + %3960 = fptrunc float %3412 to half + %3961 = fptrunc float %3413 to half + %3962 = fptrunc float %3414 to half + %3963 = fptrunc float %3415 to half + %3964 = fptrunc float %3418 to half + %3965 = fptrunc float %3419 to half + %3966 = fptrunc float %3420 to half + %3967 = fptrunc float %3421 to half + %3968 = insertelement <4 x half> poison, half %3952, i64 0 + %3969 = insertelement <4 x half> %3968, half %3953, i64 1 + %3970 = insertelement <4 x half> %3969, half %3954, i64 2 + %3971 = insertelement <4 x half> %3970, half %3955, i64 3 + %3972 = bitcast <4 x half> %3971 to <2 x i32> + %3973 = shl i32 %3948, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3972, ptr addrspace(8) %3659, i32 %3973, i32 0, i32 0) + %3974 = insertelement <4 x half> poison, half %3956, i64 0 + %3975 = insertelement <4 x half> %3974, half %3957, i64 1 + %3976 = insertelement <4 x half> %3975, half %3958, i64 2 + %3977 = insertelement <4 x half> %3976, half %3959, i64 3 + %3978 = bitcast <4 x half> %3977 to <2 x i32> + %3979 = shl i32 %3949, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3978, ptr 
addrspace(8) %3659, i32 %3979, i32 0, i32 0) + %3980 = insertelement <4 x half> poison, half %3960, i64 0 + %3981 = insertelement <4 x half> %3980, half %3961, i64 1 + %3982 = insertelement <4 x half> %3981, half %3962, i64 2 + %3983 = insertelement <4 x half> %3982, half %3963, i64 3 + %3984 = bitcast <4 x half> %3983 to <2 x i32> + %3985 = shl i32 %3950, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3984, ptr addrspace(8) %3659, i32 %3985, i32 0, i32 0) + %3986 = insertelement <4 x half> poison, half %3964, i64 0 + %3987 = insertelement <4 x half> %3986, half %3965, i64 1 + %3988 = insertelement <4 x half> %3987, half %3966, i64 2 + %3989 = insertelement <4 x half> %3988, half %3967, i64 3 + %3990 = bitcast <4 x half> %3989 to <2 x i32> + %3991 = shl i32 %3951, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3990, ptr addrspace(8) %3659, i32 %3991, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 1030) + %3992 = add i32 %3635, %2295 + %3993 = add i32 %3635, %2302 + %3994 = add i32 %3636, %2295 + %3995 = add i32 %3636, %2302 + %3996 = fptrunc float %3430 to half + %3997 = fptrunc float %3431 to half + %3998 = fptrunc float %3432 to half + %3999 = fptrunc float %3433 to half + %4000 = fptrunc float %3436 to half + %4001 = fptrunc float %3437 to half + %4002 = fptrunc float %3438 to half + %4003 = fptrunc float %3439 to half + %4004 = fptrunc float %3442 to half + %4005 = fptrunc float %3443 to half + %4006 = fptrunc float %3444 to half + %4007 = fptrunc float %3445 to half + %4008 = fptrunc float %3448 to half + %4009 = fptrunc float %3449 to half + %4010 = fptrunc float %3450 to half + %4011 = fptrunc float %3451 to half + %4012 = insertelement <4 x half> poison, half %3996, i64 0 + %4013 = insertelement <4 x half> %4012, half %3997, i64 1 + %4014 = insertelement <4 x half> %4013, half %3998, i64 2 + %4015 = insertelement <4 x half> %4014, half %3999, i64 3 + %4016 = bitcast <4 x half> %4015 to <2 x i32> + 
%4017 = shl i32 %3992, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4016, ptr addrspace(8) %3659, i32 %4017, i32 0, i32 0) + %4018 = insertelement <4 x half> poison, half %4000, i64 0 + %4019 = insertelement <4 x half> %4018, half %4001, i64 1 + %4020 = insertelement <4 x half> %4019, half %4002, i64 2 + %4021 = insertelement <4 x half> %4020, half %4003, i64 3 + %4022 = bitcast <4 x half> %4021 to <2 x i32> + %4023 = shl i32 %3993, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4022, ptr addrspace(8) %3659, i32 %4023, i32 0, i32 0) + %4024 = insertelement <4 x half> poison, half %4004, i64 0 + %4025 = insertelement <4 x half> %4024, half %4005, i64 1 + %4026 = insertelement <4 x half> %4025, half %4006, i64 2 + %4027 = insertelement <4 x half> %4026, half %4007, i64 3 + %4028 = bitcast <4 x half> %4027 to <2 x i32> + %4029 = shl i32 %3994, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4028, ptr addrspace(8) %3659, i32 %4029, i32 0, i32 0) + %4030 = insertelement <4 x half> poison, half %4008, i64 0 + %4031 = insertelement <4 x half> %4030, half %4009, i64 1 + %4032 = insertelement <4 x half> %4031, half %4010, i64 2 + %4033 = insertelement <4 x half> %4032, half %4011, i64 3 + %4034 = bitcast <4 x half> %4033 to <2 x i32> + %4035 = shl i32 %3995, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4034, ptr addrspace(8) %3659, i32 %4035, i32 0, i32 0) + %4036 = add i32 %3635, %2301 + %4037 = add i32 %3635, %2300 + %4038 = add i32 %3636, %2301 + %4039 = add i32 %3636, %2300 + %4040 = fptrunc float %3454 to half + %4041 = fptrunc float %3455 to half + %4042 = fptrunc float %3456 to half + %4043 = fptrunc float %3457 to half + %4044 = fptrunc float %3460 to half + %4045 = fptrunc float %3461 to half + %4046 = fptrunc float %3462 to half + %4047 = fptrunc float %3463 to half + %4048 = fptrunc float %3466 to half + %4049 = fptrunc float %3467 to half + %4050 = fptrunc float %3468 
to half + %4051 = fptrunc float %3469 to half + %4052 = fptrunc float %3472 to half + %4053 = fptrunc float %3473 to half + %4054 = fptrunc float %3474 to half + %4055 = fptrunc float %3475 to half + %4056 = insertelement <4 x half> poison, half %4040, i64 0 + %4057 = insertelement <4 x half> %4056, half %4041, i64 1 + %4058 = insertelement <4 x half> %4057, half %4042, i64 2 + %4059 = insertelement <4 x half> %4058, half %4043, i64 3 + %4060 = bitcast <4 x half> %4059 to <2 x i32> + %4061 = shl i32 %4036, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4060, ptr addrspace(8) %3659, i32 %4061, i32 0, i32 0) + %4062 = insertelement <4 x half> poison, half %4044, i64 0 + %4063 = insertelement <4 x half> %4062, half %4045, i64 1 + %4064 = insertelement <4 x half> %4063, half %4046, i64 2 + %4065 = insertelement <4 x half> %4064, half %4047, i64 3 + %4066 = bitcast <4 x half> %4065 to <2 x i32> + %4067 = shl i32 %4037, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4066, ptr addrspace(8) %3659, i32 %4067, i32 0, i32 0) + %4068 = insertelement <4 x half> poison, half %4048, i64 0 + %4069 = insertelement <4 x half> %4068, half %4049, i64 1 + %4070 = insertelement <4 x half> %4069, half %4050, i64 2 + %4071 = insertelement <4 x half> %4070, half %4051, i64 3 + %4072 = bitcast <4 x half> %4071 to <2 x i32> + %4073 = shl i32 %4038, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4072, ptr addrspace(8) %3659, i32 %4073, i32 0, i32 0) + %4074 = insertelement <4 x half> poison, half %4052, i64 0 + %4075 = insertelement <4 x half> %4074, half %4053, i64 1 + %4076 = insertelement <4 x half> %4075, half %4054, i64 2 + %4077 = insertelement <4 x half> %4076, half %4055, i64 3 + %4078 = bitcast <4 x half> %4077 to <2 x i32> + %4079 = shl i32 %4039, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4078, ptr addrspace(8) %3659, i32 %4079, i32 0, i32 0) + %4080 = add i32 %3635, %2299 + %4081 = add 
i32 %3635, %2298 + %4082 = add i32 %3636, %2299 + %4083 = add i32 %3636, %2298 + %4084 = fptrunc float %3532 to half + %4085 = fptrunc float %3533 to half + %4086 = fptrunc float %3534 to half + %4087 = fptrunc float %3535 to half + %4088 = fptrunc float %3538 to half + %4089 = fptrunc float %3539 to half + %4090 = fptrunc float %3540 to half + %4091 = fptrunc float %3541 to half + %4092 = fptrunc float %3544 to half + %4093 = fptrunc float %3545 to half + %4094 = fptrunc float %3546 to half + %4095 = fptrunc float %3547 to half + %4096 = fptrunc float %3550 to half + %4097 = fptrunc float %3551 to half + %4098 = fptrunc float %3552 to half + %4099 = fptrunc float %3553 to half + %4100 = insertelement <4 x half> poison, half %4084, i64 0 + %4101 = insertelement <4 x half> %4100, half %4085, i64 1 + %4102 = insertelement <4 x half> %4101, half %4086, i64 2 + %4103 = insertelement <4 x half> %4102, half %4087, i64 3 + %4104 = bitcast <4 x half> %4103 to <2 x i32> + %4105 = shl i32 %4080, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4104, ptr addrspace(8) %3659, i32 %4105, i32 0, i32 0) + %4106 = insertelement <4 x half> poison, half %4088, i64 0 + %4107 = insertelement <4 x half> %4106, half %4089, i64 1 + %4108 = insertelement <4 x half> %4107, half %4090, i64 2 + %4109 = insertelement <4 x half> %4108, half %4091, i64 3 + %4110 = bitcast <4 x half> %4109 to <2 x i32> + %4111 = shl i32 %4081, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4110, ptr addrspace(8) %3659, i32 %4111, i32 0, i32 0) + %4112 = insertelement <4 x half> poison, half %4092, i64 0 + %4113 = insertelement <4 x half> %4112, half %4093, i64 1 + %4114 = insertelement <4 x half> %4113, half %4094, i64 2 + %4115 = insertelement <4 x half> %4114, half %4095, i64 3 + %4116 = bitcast <4 x half> %4115 to <2 x i32> + %4117 = shl i32 %4082, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4116, ptr addrspace(8) %3659, i32 %4117, i32 0, 
i32 0) + %4118 = insertelement <4 x half> poison, half %4096, i64 0 + %4119 = insertelement <4 x half> %4118, half %4097, i64 1 + %4120 = insertelement <4 x half> %4119, half %4098, i64 2 + %4121 = insertelement <4 x half> %4120, half %4099, i64 3 + %4122 = bitcast <4 x half> %4121 to <2 x i32> + %4123 = shl i32 %4083, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4122, ptr addrspace(8) %3659, i32 %4123, i32 0, i32 0) + %4124 = add i32 %3635, %2297 + %4125 = add i32 %3635, %2296 + %4126 = add i32 %3636, %2297 + %4127 = add i32 %3636, %2296 + %4128 = fptrunc float %3556 to half + %4129 = fptrunc float %3557 to half + %4130 = fptrunc float %3558 to half + %4131 = fptrunc float %3559 to half + %4132 = fptrunc float %3562 to half + %4133 = fptrunc float %3563 to half + %4134 = fptrunc float %3564 to half + %4135 = fptrunc float %3565 to half + %4136 = fptrunc float %3568 to half + %4137 = fptrunc float %3569 to half + %4138 = fptrunc float %3570 to half + %4139 = fptrunc float %3571 to half + %4140 = fptrunc float %3574 to half + %4141 = fptrunc float %3575 to half + %4142 = fptrunc float %3576 to half + %4143 = fptrunc float %3577 to half + %4144 = insertelement <4 x half> poison, half %4128, i64 0 + %4145 = insertelement <4 x half> %4144, half %4129, i64 1 + %4146 = insertelement <4 x half> %4145, half %4130, i64 2 + %4147 = insertelement <4 x half> %4146, half %4131, i64 3 + %4148 = bitcast <4 x half> %4147 to <2 x i32> + %4149 = shl i32 %4124, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4148, ptr addrspace(8) %3659, i32 %4149, i32 0, i32 0) + %4150 = insertelement <4 x half> poison, half %4132, i64 0 + %4151 = insertelement <4 x half> %4150, half %4133, i64 1 + %4152 = insertelement <4 x half> %4151, half %4134, i64 2 + %4153 = insertelement <4 x half> %4152, half %4135, i64 3 + %4154 = bitcast <4 x half> %4153 to <2 x i32> + %4155 = shl i32 %4125, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> 
%4154, ptr addrspace(8) %3659, i32 %4155, i32 0, i32 0) + %4156 = insertelement <4 x half> poison, half %4136, i64 0 + %4157 = insertelement <4 x half> %4156, half %4137, i64 1 + %4158 = insertelement <4 x half> %4157, half %4138, i64 2 + %4159 = insertelement <4 x half> %4158, half %4139, i64 3 + %4160 = bitcast <4 x half> %4159 to <2 x i32> + %4161 = shl i32 %4126, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4160, ptr addrspace(8) %3659, i32 %4161, i32 0, i32 0) + %4162 = insertelement <4 x half> poison, half %4140, i64 0 + %4163 = insertelement <4 x half> %4162, half %4141, i64 1 + %4164 = insertelement <4 x half> %4163, half %4142, i64 2 + %4165 = insertelement <4 x half> %4164, half %4143, i64 3 + %4166 = bitcast <4 x half> %4165 to <2 x i32> + %4167 = shl i32 %4127, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4166, ptr addrspace(8) %3659, i32 %4167, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 1030) + %4168 = add i32 %3637, %2295 + %4169 = add i32 %3637, %2302 + %4170 = add i32 %3638, %2295 + %4171 = add i32 %3638, %2302 + %4172 = fptrunc float %3484 to half + %4173 = fptrunc float %3485 to half + %4174 = fptrunc float %3486 to half + %4175 = fptrunc float %3487 to half + %4176 = fptrunc float %3490 to half + %4177 = fptrunc float %3491 to half + %4178 = fptrunc float %3492 to half + %4179 = fptrunc float %3493 to half + %4180 = fptrunc float %3496 to half + %4181 = fptrunc float %3497 to half + %4182 = fptrunc float %3498 to half + %4183 = fptrunc float %3499 to half + %4184 = fptrunc float %3502 to half + %4185 = fptrunc float %3503 to half + %4186 = fptrunc float %3504 to half + %4187 = fptrunc float %3505 to half + %4188 = insertelement <4 x half> poison, half %4172, i64 0 + %4189 = insertelement <4 x half> %4188, half %4173, i64 1 + %4190 = insertelement <4 x half> %4189, half %4174, i64 2 + %4191 = insertelement <4 x half> %4190, half %4175, i64 3 + %4192 = bitcast <4 x half> %4191 to 
<2 x i32> + %4193 = shl i32 %4168, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4192, ptr addrspace(8) %3659, i32 %4193, i32 0, i32 0) + %4194 = insertelement <4 x half> poison, half %4176, i64 0 + %4195 = insertelement <4 x half> %4194, half %4177, i64 1 + %4196 = insertelement <4 x half> %4195, half %4178, i64 2 + %4197 = insertelement <4 x half> %4196, half %4179, i64 3 + %4198 = bitcast <4 x half> %4197 to <2 x i32> + %4199 = shl i32 %4169, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4198, ptr addrspace(8) %3659, i32 %4199, i32 0, i32 0) + %4200 = insertelement <4 x half> poison, half %4180, i64 0 + %4201 = insertelement <4 x half> %4200, half %4181, i64 1 + %4202 = insertelement <4 x half> %4201, half %4182, i64 2 + %4203 = insertelement <4 x half> %4202, half %4183, i64 3 + %4204 = bitcast <4 x half> %4203 to <2 x i32> + %4205 = shl i32 %4170, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4204, ptr addrspace(8) %3659, i32 %4205, i32 0, i32 0) + %4206 = insertelement <4 x half> poison, half %4184, i64 0 + %4207 = insertelement <4 x half> %4206, half %4185, i64 1 + %4208 = insertelement <4 x half> %4207, half %4186, i64 2 + %4209 = insertelement <4 x half> %4208, half %4187, i64 3 + %4210 = bitcast <4 x half> %4209 to <2 x i32> + %4211 = shl i32 %4171, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4210, ptr addrspace(8) %3659, i32 %4211, i32 0, i32 0) + %4212 = add i32 %3637, %2301 + %4213 = add i32 %3637, %2300 + %4214 = add i32 %3638, %2301 + %4215 = add i32 %3638, %2300 + %4216 = fptrunc float %3508 to half + %4217 = fptrunc float %3509 to half + %4218 = fptrunc float %3510 to half + %4219 = fptrunc float %3511 to half + %4220 = fptrunc float %3514 to half + %4221 = fptrunc float %3515 to half + %4222 = fptrunc float %3516 to half + %4223 = fptrunc float %3517 to half + %4224 = fptrunc float %3520 to half + %4225 = fptrunc float %3521 to half + %4226 = fptrunc 
float %3522 to half + %4227 = fptrunc float %3523 to half + %4228 = fptrunc float %3526 to half + %4229 = fptrunc float %3527 to half + %4230 = fptrunc float %3528 to half + %4231 = fptrunc float %3529 to half + %4232 = insertelement <4 x half> poison, half %4216, i64 0 + %4233 = insertelement <4 x half> %4232, half %4217, i64 1 + %4234 = insertelement <4 x half> %4233, half %4218, i64 2 + %4235 = insertelement <4 x half> %4234, half %4219, i64 3 + %4236 = bitcast <4 x half> %4235 to <2 x i32> + %4237 = shl i32 %4212, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4236, ptr addrspace(8) %3659, i32 %4237, i32 0, i32 0) + %4238 = insertelement <4 x half> poison, half %4220, i64 0 + %4239 = insertelement <4 x half> %4238, half %4221, i64 1 + %4240 = insertelement <4 x half> %4239, half %4222, i64 2 + %4241 = insertelement <4 x half> %4240, half %4223, i64 3 + %4242 = bitcast <4 x half> %4241 to <2 x i32> + %4243 = shl i32 %4213, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4242, ptr addrspace(8) %3659, i32 %4243, i32 0, i32 0) + %4244 = insertelement <4 x half> poison, half %4224, i64 0 + %4245 = insertelement <4 x half> %4244, half %4225, i64 1 + %4246 = insertelement <4 x half> %4245, half %4226, i64 2 + %4247 = insertelement <4 x half> %4246, half %4227, i64 3 + %4248 = bitcast <4 x half> %4247 to <2 x i32> + %4249 = shl i32 %4214, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4248, ptr addrspace(8) %3659, i32 %4249, i32 0, i32 0) + %4250 = insertelement <4 x half> poison, half %4228, i64 0 + %4251 = insertelement <4 x half> %4250, half %4229, i64 1 + %4252 = insertelement <4 x half> %4251, half %4230, i64 2 + %4253 = insertelement <4 x half> %4252, half %4231, i64 3 + %4254 = bitcast <4 x half> %4253 to <2 x i32> + %4255 = shl i32 %4215, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4254, ptr addrspace(8) %3659, i32 %4255, i32 0, i32 0) + %4256 = add i32 %3637, %2299 + 
%4257 = add i32 %3637, %2298 + %4258 = add i32 %3638, %2299 + %4259 = add i32 %3638, %2298 + %4260 = fptrunc float %3580 to half + %4261 = fptrunc float %3581 to half + %4262 = fptrunc float %3582 to half + %4263 = fptrunc float %3583 to half + %4264 = fptrunc float %3586 to half + %4265 = fptrunc float %3587 to half + %4266 = fptrunc float %3588 to half + %4267 = fptrunc float %3589 to half + %4268 = fptrunc float %3592 to half + %4269 = fptrunc float %3593 to half + %4270 = fptrunc float %3594 to half + %4271 = fptrunc float %3595 to half + %4272 = fptrunc float %3598 to half + %4273 = fptrunc float %3599 to half + %4274 = fptrunc float %3600 to half + %4275 = fptrunc float %3601 to half + %4276 = insertelement <4 x half> poison, half %4260, i64 0 + %4277 = insertelement <4 x half> %4276, half %4261, i64 1 + %4278 = insertelement <4 x half> %4277, half %4262, i64 2 + %4279 = insertelement <4 x half> %4278, half %4263, i64 3 + %4280 = bitcast <4 x half> %4279 to <2 x i32> + %4281 = shl i32 %4256, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4280, ptr addrspace(8) %3659, i32 %4281, i32 0, i32 0) + %4282 = insertelement <4 x half> poison, half %4264, i64 0 + %4283 = insertelement <4 x half> %4282, half %4265, i64 1 + %4284 = insertelement <4 x half> %4283, half %4266, i64 2 + %4285 = insertelement <4 x half> %4284, half %4267, i64 3 + %4286 = bitcast <4 x half> %4285 to <2 x i32> + %4287 = shl i32 %4257, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4286, ptr addrspace(8) %3659, i32 %4287, i32 0, i32 0) + %4288 = insertelement <4 x half> poison, half %4268, i64 0 + %4289 = insertelement <4 x half> %4288, half %4269, i64 1 + %4290 = insertelement <4 x half> %4289, half %4270, i64 2 + %4291 = insertelement <4 x half> %4290, half %4271, i64 3 + %4292 = bitcast <4 x half> %4291 to <2 x i32> + %4293 = shl i32 %4258, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4292, ptr addrspace(8) %3659, i32 
%4293, i32 0, i32 0) + %4294 = insertelement <4 x half> poison, half %4272, i64 0 + %4295 = insertelement <4 x half> %4294, half %4273, i64 1 + %4296 = insertelement <4 x half> %4295, half %4274, i64 2 + %4297 = insertelement <4 x half> %4296, half %4275, i64 3 + %4298 = bitcast <4 x half> %4297 to <2 x i32> + %4299 = shl i32 %4259, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4298, ptr addrspace(8) %3659, i32 %4299, i32 0, i32 0) + %4300 = add i32 %3637, %2297 + %4301 = add i32 %3637, %2296 + %4302 = add i32 %3638, %2297 + %4303 = add i32 %3638, %2296 + %4304 = fptrunc float %3604 to half + %4305 = fptrunc float %3605 to half + %4306 = fptrunc float %3606 to half + %4307 = fptrunc float %3607 to half + %4308 = fptrunc float %3610 to half + %4309 = fptrunc float %3611 to half + %4310 = fptrunc float %3612 to half + %4311 = fptrunc float %3613 to half + %4312 = fptrunc float %3616 to half + %4313 = fptrunc float %3617 to half + %4314 = fptrunc float %3618 to half + %4315 = fptrunc float %3619 to half + %4316 = fptrunc float %3622 to half + %4317 = fptrunc float %3623 to half + %4318 = fptrunc float %3624 to half + %4319 = fptrunc float %3625 to half + %4320 = insertelement <4 x half> poison, half %4304, i64 0 + %4321 = insertelement <4 x half> %4320, half %4305, i64 1 + %4322 = insertelement <4 x half> %4321, half %4306, i64 2 + %4323 = insertelement <4 x half> %4322, half %4307, i64 3 + %4324 = bitcast <4 x half> %4323 to <2 x i32> + %4325 = shl i32 %4300, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4324, ptr addrspace(8) %3659, i32 %4325, i32 0, i32 0) + %4326 = insertelement <4 x half> poison, half %4308, i64 0 + %4327 = insertelement <4 x half> %4326, half %4309, i64 1 + %4328 = insertelement <4 x half> %4327, half %4310, i64 2 + %4329 = insertelement <4 x half> %4328, half %4311, i64 3 + %4330 = bitcast <4 x half> %4329 to <2 x i32> + %4331 = shl i32 %4301, 1 + tail call void 
@llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4330, ptr addrspace(8) %3659, i32 %4331, i32 0, i32 0) + %4332 = insertelement <4 x half> poison, half %4312, i64 0 + %4333 = insertelement <4 x half> %4332, half %4313, i64 1 + %4334 = insertelement <4 x half> %4333, half %4314, i64 2 + %4335 = insertelement <4 x half> %4334, half %4315, i64 3 + %4336 = bitcast <4 x half> %4335 to <2 x i32> + %4337 = shl i32 %4302, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4336, ptr addrspace(8) %3659, i32 %4337, i32 0, i32 0) + %4338 = insertelement <4 x half> poison, half %4316, i64 0 + %4339 = insertelement <4 x half> %4338, half %4317, i64 1 + %4340 = insertelement <4 x half> %4339, half %4318, i64 2 + %4341 = insertelement <4 x half> %4340, half %4319, i64 3 + %4342 = bitcast <4 x half> %4341 to <2 x i32> + %4343 = shl i32 %4303, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4342, ptr addrspace(8) %3659, i32 %4343, i32 0, i32 0) + ret void +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smin.i32(i32, i32) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.amdgcn.workitem.id.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) 
+declare void @llvm.assume(i1 noundef) #3 + +; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn +declare void @llvm.amdgcn.s.barrier() #4 + +; Function Attrs: convergent mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) #5 + +; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn +declare void @llvm.amdgcn.sched.barrier(i32 immarg) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32>, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #6 + +; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn +declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg) #4 + +attributes #0 = { nofree norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0" "denormal-fp-math-f32"="ieee" "uniform-work-group-size"="false" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #4 = { convergent mustprogress nocallback nofree nounwind willreturn } +attributes #5 = { convergent mustprogress 
nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 1, !"amdhsa_code_object_version", i32 400} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!3 = !DIFile(filename: "", directory: "") +!4 = distinct !DISubprogram(name: "matmul_kernel", linkageName: "matmul_kernel", scope: !3, file: !3, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 222, column: 7, scope: !4) +!8 = !DILocation(line: 224, column: 7, scope: !4) +!9 = !DILocation(line: 225, column: 7, scope: !4) +!10 = !DILocation(line: 230, column: 7, scope: !4) +!11 = !DILocation(line: 231, column: 7, scope: !4) +!12 = !DILocation(line: 232, column: 7, scope: !4) +!13 = !DILocation(line: 233, column: 7, scope: !4) +!14 = !DILocation(line: 234, column: 7, scope: !4) +!15 = !DILocation(line: 235, column: 7, scope: !4) +!16 = !DILocation(line: 236, column: 7, scope: !4) +!17 = !DILocation(line: 237, column: 7, scope: !4) +!18 = !DILocation(line: 238, column: 7, scope: !4) +!19 = !DILocation(line: 241, column: 7, scope: !4) +!20 = !DILocation(line: 242, column: 7, scope: !4) +!21 = !DILocation(line: 247, column: 7, scope: !4) +!22 = !DILocation(line: 249, column: 7, scope: !4) +!23 = !DILocation(line: 250, column: 7, scope: !4) +!24 = !DILocation(line: 255, column: 7, scope: !4) +!25 = !DILocation(line: 256, column: 7, scope: !4) +!26 = !DILocation(line: 257, column: 7, scope: !4) +!27 = !DILocation(line: 258, column: 7, scope: !4) +!28 = !DILocation(line: 259, column: 7, scope: !4) +!29 = !DILocation(line: 260, column: 7, scope: !4) +!30 = !DILocation(line: 261, column: 7, scope: !4) +!31 = 
!DILocation(line: 262, column: 7, scope: !4) +!32 = !DILocation(line: 263, column: 7, scope: !4) +!33 = !DILocation(line: 266, column: 7, scope: !4) +!34 = !DILocation(line: 267, column: 7, scope: !4) +!35 = !DILocation(line: 275, column: 7, scope: !4) +!36 = !DILocation(line: 280, column: 7, scope: !4) +!37 = !DILocation(line: 285, column: 7, scope: !4) +!38 = !DILocation(line: 286, column: 7, scope: !4) +!39 = !DILocation(line: 287, column: 7, scope: !4) +!40 = !DILocation(line: 288, column: 7, scope: !4) +!41 = !DILocation(line: 289, column: 7, scope: !4) +!42 = !DILocation(line: 290, column: 7, scope: !4) +!43 = !DILocation(line: 291, column: 7, scope: !4) +!44 = !DILocation(line: 292, column: 7, scope: !4) +!45 = !DILocation(line: 293, column: 7, scope: !4) +!46 = !DILocation(line: 298, column: 7, scope: !4) +!47 = !DILocation(line: 299, column: 7, scope: !4) +!48 = !DILocation(line: 300, column: 7, scope: !4) +!49 = !DILocation(line: 301, column: 7, scope: !4) +!50 = !DILocation(line: 302, column: 7, scope: !4) +!51 = !DILocation(line: 303, column: 7, scope: !4) +!52 = !DILocation(line: 304, column: 7, scope: !4) +!53 = !DILocation(line: 305, column: 7, scope: !4) +!54 = !DILocation(line: 306, column: 7, scope: !4) +!55 = !DILocation(line: 309, column: 7, scope: !4) +!56 = !DILocation(line: 310, column: 7, scope: !4) +!57 = !DILocation(line: 315, column: 7, scope: !4) +!58 = !DILocation(line: 316, column: 7, scope: !4) +!59 = !DILocation(line: 321, column: 7, scope: !4) +!60 = !DILocation(line: 322, column: 7, scope: !4) +!61 = !DILocation(line: 323, column: 7, scope: !4) +!62 = !DILocation(line: 324, column: 7, scope: !4) +!63 = !DILocation(line: 325, column: 7, scope: !4) +!64 = !DILocation(line: 326, column: 7, scope: !4) +!65 = !DILocation(line: 327, column: 7, scope: !4) +!66 = !DILocation(line: 328, column: 7, scope: !4) +!67 = !DILocation(line: 329, column: 7, scope: !4) +!68 = !DILocation(line: 334, column: 7, scope: !4) +!69 = !DILocation(line: 335, 
column: 7, scope: !4) +!70 = !DILocation(line: 336, column: 7, scope: !4) +!71 = !DILocation(line: 337, column: 7, scope: !4) +!72 = !DILocation(line: 338, column: 7, scope: !4) +!73 = !DILocation(line: 339, column: 7, scope: !4) +!74 = !DILocation(line: 340, column: 7, scope: !4) +!75 = !DILocation(line: 341, column: 7, scope: !4) +!76 = !DILocation(line: 342, column: 7, scope: !4) +!77 = !DILocation(line: 347, column: 7, scope: !4) +!78 = !DILocation(line: 348, column: 7, scope: !4) +!79 = !DILocation(line: 364, column: 7, scope: !4) +!80 = !DILocation(line: 365, column: 7, scope: !4) +!81 = !DILocation(line: 366, column: 7, scope: !4) +!82 = !DILocation(line: 367, column: 7, scope: !4) +!83 = !DILocation(line: 368, column: 7, scope: !4) +!84 = !DILocation(line: 369, column: 7, scope: !4) +!85 = !DILocation(line: 370, column: 7, scope: !4) +!86 = !DILocation(line: 371, column: 7, scope: !4) +!87 = !DILocation(line: 372, column: 7, scope: !4) +!88 = !DILocation(line: 373, column: 7, scope: !4) +!89 = !DILocation(line: 374, column: 7, scope: !4) +!90 = !DILocation(line: 375, column: 7, scope: !4) +!91 = !DILocation(line: 376, column: 7, scope: !4) +!92 = !DILocation(line: 377, column: 7, scope: !4) +!93 = !DILocation(line: 378, column: 7, scope: !4) +!94 = !DILocation(line: 379, column: 7, scope: !4) +!95 = !DILocation(line: 380, column: 7, scope: !4) +!96 = !DILocation(line: 381, column: 7, scope: !4) +!97 = !DILocation(line: 382, column: 7, scope: !4) +!98 = !DILocation(line: 383, column: 7, scope: !4) +!99 = !DILocation(line: 384, column: 7, scope: !4) +!100 = !DILocation(line: 385, column: 7, scope: !4) +!101 = !DILocation(line: 386, column: 7, scope: !4) +!102 = !DILocation(line: 387, column: 7, scope: !4) +!103 = !DILocation(line: 388, column: 7, scope: !4) +!104 = !DILocation(line: 389, column: 7, scope: !4) +!105 = !DILocation(line: 390, column: 7, scope: !4) +!106 = !DILocation(line: 391, column: 7, scope: !4) +!107 = !DILocation(line: 392, column: 7, 
scope: !4) +!108 = !DILocation(line: 393, column: 7, scope: !4) +!109 = !DILocation(line: 394, column: 7, scope: !4) +!110 = !DILocation(line: 395, column: 7, scope: !4) +!111 = !DILocation(line: 396, column: 7, scope: !4) +!112 = !DILocation(line: 412, column: 7, scope: !4) +!113 = !DILocation(line: 413, column: 7, scope: !4) +!114 = !DILocation(line: 414, column: 7, scope: !4) +!115 = !DILocation(line: 415, column: 7, scope: !4) +!116 = !DILocation(line: 416, column: 7, scope: !4) +!117 = !DILocation(line: 417, column: 7, scope: !4) +!118 = !DILocation(line: 418, column: 7, scope: !4) +!119 = !DILocation(line: 419, column: 7, scope: !4) +!120 = !DILocation(line: 420, column: 7, scope: !4) +!121 = !DILocation(line: 421, column: 7, scope: !4) +!122 = !DILocation(line: 422, column: 7, scope: !4) +!123 = !DILocation(line: 423, column: 7, scope: !4) +!124 = !DILocation(line: 424, column: 7, scope: !4) +!125 = !DILocation(line: 425, column: 7, scope: !4) +!126 = !DILocation(line: 426, column: 7, scope: !4) +!127 = !DILocation(line: 427, column: 7, scope: !4) +!128 = !DILocation(line: 428, column: 7, scope: !4) +!129 = !DILocation(line: 429, column: 7, scope: !4) +!130 = !DILocation(line: 430, column: 7, scope: !4) +!131 = !DILocation(line: 431, column: 7, scope: !4) +!132 = !DILocation(line: 432, column: 7, scope: !4) +!133 = !DILocation(line: 433, column: 7, scope: !4) +!134 = !DILocation(line: 434, column: 7, scope: !4) +!135 = !DILocation(line: 435, column: 7, scope: !4) +!136 = !DILocation(line: 436, column: 7, scope: !4) +!137 = !DILocation(line: 437, column: 7, scope: !4) +!138 = !DILocation(line: 438, column: 7, scope: !4) +!139 = !DILocation(line: 439, column: 7, scope: !4) +!140 = !DILocation(line: 440, column: 7, scope: !4) +!141 = !DILocation(line: 441, column: 7, scope: !4) +!142 = !DILocation(line: 442, column: 7, scope: !4) +!143 = !DILocation(line: 443, column: 7, scope: !4) +!144 = !DILocation(line: 444, column: 7, scope: !4) +!145 = !DILocation(line: 
449, column: 7, scope: !4) +!146 = !DILocation(line: 450, column: 7, scope: !4) +!147 = !DILocation(line: 456, column: 7, scope: !4) +!148 = !DILocation(line: 457, column: 7, scope: !4) +!149 = !DILocation(line: 458, column: 7, scope: !4) +!150 = !DILocation(line: 459, column: 7, scope: !4) +!151 = !DILocation(line: 460, column: 7, scope: !4) +!152 = !DILocation(line: 461, column: 7, scope: !4) +!153 = !DILocation(line: 462, column: 7, scope: !4) +!154 = !DILocation(line: 463, column: 7, scope: !4) +!155 = !DILocation(line: 464, column: 7, scope: !4) +!156 = !DILocation(line: 469, column: 7, scope: !4) +!157 = !DILocation(line: 470, column: 7, scope: !4) +!158 = !DILocation(line: 471, column: 7, scope: !4) +!159 = !DILocation(line: 472, column: 7, scope: !4) +!160 = !DILocation(line: 473, column: 7, scope: !4) +!161 = !DILocation(line: 474, column: 7, scope: !4) +!162 = !DILocation(line: 475, column: 7, scope: !4) +!163 = !DILocation(line: 476, column: 7, scope: !4) +!164 = !DILocation(line: 477, column: 7, scope: !4) +!165 = !DILocation(line: 480, column: 7, scope: !4) +!166 = !DILocation(line: 481, column: 7, scope: !4) +!167 = !DILocation(line: 166, column: 9, scope: !4) +!168 = !DILocation(line: 174, column: 9, scope: !4) +!169 = !DILocation(line: 175, column: 9, scope: !4) +!170 = !DILocation(line: 176, column: 9, scope: !4) +!171 = !DILocation(line: 177, column: 9, scope: !4) +!172 = !DILocation(line: 178, column: 9, scope: !4) +!173 = !DILocation(line: 179, column: 9, scope: !4) +!174 = !DILocation(line: 180, column: 9, scope: !4) +!175 = !DILocation(line: 181, column: 9, scope: !4) +!176 = !DILocation(line: 182, column: 9, scope: !4) +!177 = !DILocation(line: 183, column: 9, scope: !4) +!178 = !DILocation(line: 184, column: 9, scope: !4) +!179 = !DILocation(line: 185, column: 9, scope: !4) +!180 = !DILocation(line: 186, column: 9, scope: !4) +!181 = !DILocation(line: 187, column: 9, scope: !4) +!182 = !DILocation(line: 188, column: 9, scope: !4) diff --git 
a/llvm/test/CodeGen/AMDGPU/4_tlp_fast_no_barriers.llir b/llvm/test/CodeGen/AMDGPU/4_tlp_fast_no_barriers.llir new file mode 100644 index 0000000000000..22f52e751006c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/4_tlp_fast_no_barriers.llir @@ -0,0 +1,4774 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nofree norecurse nounwind +define amdgpu_kernel void @matmul_kernel(ptr addrspace(1) inreg nocapture readonly %0, ptr addrspace(1) inreg nocapture readonly %1, ptr addrspace(1) inreg nocapture writeonly %2, ptr addrspace(1) inreg nocapture readnone %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, ptr addrspace(1) inreg nocapture readnone %11) local_unnamed_addr #0 !dbg !4 { + %13 = tail call i32 @llvm.amdgcn.workgroup.id.x() + %14 = sdiv i32 %13, 8 + %15 = mul i32 %13, 76 + %16 = mul i32 %14, -607 + %17 = add i32 %16, %15 + %18 = add i32 %5, 255 + %19 = sdiv i32 %18, 256 + %20 = shl nsw i32 %19, 2 + %.frozen = freeze i32 %20 + %21 = sdiv i32 %17, %.frozen + %22 = shl nsw i32 %21, 2 + %23 = mul i32 %21, %.frozen + %.decomposed = sub i32 %17, %23 + %24 = add i32 %4, 255 + %25 = sdiv i32 %24, 256 + %26 = sub nsw i32 %25, %22 + %27 = tail call i32 @llvm.smin.i32(i32 %26, i32 4) + %.decomposed.frozen = freeze i32 %.decomposed + %.frozen2426 = freeze i32 %27 + %28 = sdiv i32 %.decomposed.frozen, %.frozen2426 + %29 = mul i32 %28, %.frozen2426 + %.decomposed2427 = sub i32 %.decomposed.frozen, %29 + %30 = add nsw i32 %.decomposed2427, %22 + %31 = shl i32 %30, 8 + %32 = tail call i32 
@llvm.amdgcn.workitem.id.x() + %33 = lshr i32 %32, 3 + %34 = and i32 %33, 16 + %35 = and i32 %33, 31 + %36 = or disjoint i32 %35, 32 + %37 = or disjoint i32 %35, 64 + %38 = or disjoint i32 %35, 96 + %39 = or disjoint i32 %35, 128 + %40 = or disjoint i32 %35, 160 + %41 = or disjoint i32 %35, 192 + %42 = or disjoint i32 %35, 224 + %43 = mul i32 %31, %7 + %44 = mul i32 %7, %35 + %45 = mul i32 %7, %36 + %46 = mul i32 %7, %37 + %47 = mul i32 %7, %38 + %48 = mul i32 %7, %39 + %49 = mul i32 %7, %40 + %50 = mul i32 %7, %41 + %51 = mul i32 %7, %42 + %52 = sext i32 %43 to i64 + %53 = getelementptr half, ptr addrspace(1) %0, i64 %52 + %54 = shl i32 %32, 3 + %55 = and i32 %54, 56 + %56 = add i32 %44, %55 + %57 = add i32 %45, %55 + %58 = add i32 %46, %55 + %59 = add i32 %47, %55 + %60 = add i32 %48, %55 + %61 = add i32 %49, %55 + %62 = add i32 %50, %55 + %63 = add i32 %51, %55 + %64 = getelementptr i8, ptr addrspace(1) %53, i64 128 + %65 = add i32 %6, 63 + %66 = icmp sgt i32 %65, 63 + %67 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %53, i16 0, i32 2147483646, i32 159744) + %68 = shl i32 %56, 1 + %69 = select i1 %66, i32 %68, i32 -2147483648 + %70 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %69, i32 0, i32 0) + %71 = shl i32 %57, 1 + %72 = select i1 %66, i32 %71, i32 -2147483648 + %73 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %72, i32 0, i32 0) + %74 = shl i32 %58, 1 + %75 = select i1 %66, i32 %74, i32 -2147483648 + %76 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %75, i32 0, i32 0) + %77 = shl i32 %59, 1 + %78 = select i1 %66, i32 %77, i32 -2147483648 + %79 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %78, i32 0, i32 0) + %80 = shl i32 %60, 1 + %81 = select i1 %66, i32 %80, i32 -2147483648 + %82 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) 
%67, i32 %81, i32 0, i32 0) + %83 = shl i32 %61, 1 + %84 = select i1 %66, i32 %83, i32 -2147483648 + %85 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %84, i32 0, i32 0) + %86 = shl i32 %62, 1 + %87 = select i1 %66, i32 %86, i32 -2147483648 + %88 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %87, i32 0, i32 0) + %89 = shl i32 %63, 1 + %90 = select i1 %66, i32 %89, i32 -2147483648 + %91 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %90, i32 0, i32 0) + %92 = icmp sgt i32 %65, 127 + %93 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %64, i16 0, i32 2147483646, i32 159744) + %94 = select i1 %92, i32 %68, i32 -2147483648 + %95 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %94, i32 0, i32 0) + %96 = bitcast <4 x i32> %95 to <8 x half> + %97 = select i1 %92, i32 %71, i32 -2147483648 + %98 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %97, i32 0, i32 0) + %99 = bitcast <4 x i32> %98 to <8 x half> + %100 = select i1 %92, i32 %74, i32 -2147483648 + %101 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %100, i32 0, i32 0) + %102 = bitcast <4 x i32> %101 to <8 x half> + %103 = select i1 %92, i32 %77, i32 -2147483648 + %104 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %103, i32 0, i32 0) + %105 = bitcast <4 x i32> %104 to <8 x half> + %106 = select i1 %92, i32 %80, i32 -2147483648 + %107 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %106, i32 0, i32 0) + %108 = bitcast <4 x i32> %107 to <8 x half> + %109 = select i1 %92, i32 %83, i32 -2147483648 + %110 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %109, i32 0, i32 0) + %111 = bitcast <4 x i32> %110 to <8 x half> + %112 = select i1 %92, 
i32 %86, i32 -2147483648 + %113 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %112, i32 0, i32 0) + %114 = bitcast <4 x i32> %113 to <8 x half> + %115 = select i1 %92, i32 %89, i32 -2147483648 + %116 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %115, i32 0, i32 0) + %117 = bitcast <4 x i32> %116 to <8 x half> + %118 = shl i32 %28, 8 + %119 = mul i32 %118, %8 + %120 = mul i32 %8, %35 + %121 = mul i32 %8, %36 + %122 = mul i32 %8, %37 + %123 = mul i32 %8, %38 + %124 = mul i32 %8, %39 + %125 = mul i32 %8, %40 + %126 = mul i32 %8, %41 + %127 = mul i32 %8, %42 + %128 = sext i32 %119 to i64 + %129 = getelementptr half, ptr addrspace(1) %1, i64 %128 + %130 = add i32 %120, %55 + %131 = add i32 %121, %55 + %132 = add i32 %122, %55 + %133 = add i32 %123, %55 + %134 = add i32 %124, %55 + %135 = add i32 %125, %55 + %136 = add i32 %126, %55 + %137 = add i32 %127, %55 + %138 = getelementptr i8, ptr addrspace(1) %129, i64 128 + %139 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %129, i16 0, i32 2147483646, i32 159744) + %140 = shl i32 %130, 1 + %141 = select i1 %66, i32 %140, i32 -2147483648 + %142 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %141, i32 0, i32 0) + %143 = shl i32 %131, 1 + %144 = select i1 %66, i32 %143, i32 -2147483648 + %145 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %144, i32 0, i32 0) + %146 = shl i32 %132, 1 + %147 = select i1 %66, i32 %146, i32 -2147483648 + %148 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %147, i32 0, i32 0) + %149 = shl i32 %133, 1 + %150 = select i1 %66, i32 %149, i32 -2147483648 + %151 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %150, i32 0, i32 0) + %152 = shl i32 %134, 1 + %153 = select i1 %66, i32 %152, i32 -2147483648 + %154 = tail call <4 x i32> 
@llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %153, i32 0, i32 0) + %155 = shl i32 %135, 1 + %156 = select i1 %66, i32 %155, i32 -2147483648 + %157 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %156, i32 0, i32 0) + %158 = shl i32 %136, 1 + %159 = select i1 %66, i32 %158, i32 -2147483648 + %160 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %159, i32 0, i32 0) + %161 = shl i32 %137, 1 + %162 = select i1 %66, i32 %161, i32 -2147483648 + %163 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %162, i32 0, i32 0) + %164 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %138, i16 0, i32 2147483646, i32 159744) + %165 = select i1 %92, i32 %140, i32 -2147483648 + %166 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %165, i32 0, i32 0) + %167 = bitcast <4 x i32> %166 to <8 x half> + %168 = select i1 %92, i32 %143, i32 -2147483648 + %169 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %168, i32 0, i32 0) + %170 = bitcast <4 x i32> %169 to <8 x half> + %171 = select i1 %92, i32 %146, i32 -2147483648 + %172 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %171, i32 0, i32 0) + %173 = bitcast <4 x i32> %172 to <8 x half> + %174 = select i1 %92, i32 %149, i32 -2147483648 + %175 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %174, i32 0, i32 0) + %176 = bitcast <4 x i32> %175 to <8 x half> + %177 = select i1 %92, i32 %152, i32 -2147483648 + %178 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %177, i32 0, i32 0) + %179 = bitcast <4 x i32> %178 to <8 x half> + %180 = select i1 %92, i32 %155, i32 -2147483648 + %181 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %180, i32 0, i32 
0) + %182 = bitcast <4 x i32> %181 to <8 x half> + %183 = select i1 %92, i32 %158, i32 -2147483648 + %184 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %183, i32 0, i32 0) + %185 = bitcast <4 x i32> %184 to <8 x half> + %186 = select i1 %92, i32 %161, i32 -2147483648 + %187 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %186, i32 0, i32 0) + %188 = bitcast <4 x i32> %187 to <8 x half> + %189 = icmp sgt i32 %7, 0 + tail call void @llvm.assume(i1 %189) + %190 = icmp sgt i32 %8, 0 + tail call void @llvm.assume(i1 %190) + %191 = icmp sgt i32 %9, 0 + tail call void @llvm.assume(i1 %191) + %192 = icmp sgt i32 %10, 0 + tail call void @llvm.assume(i1 %192) + %193 = icmp sgt i32 %30, 0 + tail call void @llvm.assume(i1 %193) + %194 = icmp sgt i32 %28, 0 + tail call void @llvm.assume(i1 %194) + %195 = xor i32 %54, %32 + %196 = and i32 %195, 56 + %197 = shl nuw nsw i32 %35, 6 + %198 = or disjoint i32 %197, %196 + %199 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %198 + store <4 x i32> %70, ptr addrspace(3) %199, align 16 + %200 = or disjoint i32 %198, 2048 + %201 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %200 + store <4 x i32> %73, ptr addrspace(3) %201, align 16 + %202 = or disjoint i32 %198, 4096 + %203 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %202 + store <4 x i32> %76, ptr addrspace(3) %203, align 16 + %204 = or disjoint i32 %198, 6144 + %205 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %204 + store <4 x i32> %79, ptr addrspace(3) %205, align 16 + %206 = or disjoint i32 %198, 8192 + %207 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %206 + store <4 x i32> %82, ptr addrspace(3) %207, align 16 + %208 = or disjoint i32 %198, 10240 + %209 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %208 + store <4 x i32> %85, ptr addrspace(3) %209, align 16 + %210 = or disjoint i32 
%198, 12288 + %211 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %210 + store <4 x i32> %88, ptr addrspace(3) %211, align 16 + %212 = or disjoint i32 %198, 14336 + %213 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %212 + store <4 x i32> %91, ptr addrspace(3) %213, align 16 + %214 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %198 + store <4 x i32> %142, ptr addrspace(3) %214, align 16 + %215 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %200 + store <4 x i32> %145, ptr addrspace(3) %215, align 16 + %216 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %202 + store <4 x i32> %148, ptr addrspace(3) %216, align 16 + %217 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %204 + store <4 x i32> %151, ptr addrspace(3) %217, align 16 + %218 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %206 + store <4 x i32> %154, ptr addrspace(3) %218, align 16 + %219 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %208 + store <4 x i32> %157, ptr addrspace(3) %219, align 16 + %220 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %210 + store <4 x i32> %160, ptr addrspace(3) %220, align 16 + %221 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %212 + store <4 x i32> %163, ptr addrspace(3) %221, align 16 + fence syncscope("workgroup") release + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + %222 = and i32 %32, 15 + %223 = lshr i32 %32, 4 + %224 = and i32 %223, 3 + %225 = or disjoint i32 %34, 
%222 + %226 = and i32 %32, 7 + %227 = xor i32 %224, %226 + %228 = shl nuw nsw i32 %227, 3 + %229 = shl nuw nsw i32 %225, 6 + %230 = or disjoint i32 %229, %228 + %231 = or disjoint i32 %229, 2048 + %232 = or disjoint i32 %231, %228 + %233 = getelementptr half, ptr addrspace(3) @global_smem, i32 %230 + %234 = load <8 x half>, ptr addrspace(3) %233, align 16 + %235 = getelementptr half, ptr addrspace(3) @global_smem, i32 %232 + %236 = load <8 x half>, ptr addrspace(3) %235, align 16 + %237 = lshr i32 %32, 2 + %238 = and i32 %237, 16 + %239 = or disjoint i32 %238, %222 + %240 = shl nuw nsw i32 %239, 6 + %241 = or disjoint i32 %228, %240 + %242 = or disjoint i32 %241, 2048 + %243 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %241 + %244 = load <8 x half>, ptr addrspace(3) %243, align 16 + %245 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %242 + %246 = load <8 x half>, ptr addrspace(3) %245, align 16 + %247 = or disjoint i32 %229, 4096 + %248 = or disjoint i32 %247, %228 + %249 = or disjoint i32 %229, 6144 + %250 = or disjoint i32 %249, %228 + %251 = getelementptr half, ptr addrspace(3) @global_smem, i32 %248 + %252 = load <8 x half>, ptr addrspace(3) %251, align 16 + %253 = getelementptr half, ptr addrspace(3) @global_smem, i32 %250 + %254 = load <8 x half>, ptr addrspace(3) %253, align 16 + %255 = or disjoint i32 %240, 4096 + %256 = or disjoint i32 %255, %228 + %257 = or disjoint i32 %256, 2048 + %258 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %256 + %259 = load <8 x half>, ptr addrspace(3) %258, align 16 + %260 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %257 + %261 = load <8 x half>, ptr addrspace(3) %260, align 16 + %262 = icmp sgt i32 %65, 191 + br i1 %262, label %.lr.ph, label %.._crit_edge_crit_edge + +.._crit_edge_crit_edge: 
; preds = %12 + %.pre = or disjoint i32 %240, 8192 + %.pre1013 = or disjoint i32 %.pre, %228 + %.pre1015 = or disjoint i32 %240, 12288 + %.pre1017 = or disjoint i32 %.pre1015, %228 + %.pre1019 = or disjoint i32 %229, 8192 + %.pre1021 = or disjoint i32 %.pre1019, %228 + %.pre1023 = or disjoint i32 %229, 10240 + %.pre1025 = or disjoint i32 %.pre1023, %228 + %.pre1027 = or disjoint i32 %229, 12288 + %.pre1029 = or disjoint i32 %.pre1027, %228 + %.pre1031 = or disjoint i32 %229, 14336 + %.pre1033 = or disjoint i32 %.pre1031, %228 + %.pre1035 = or disjoint i32 %224, 4 + %.pre1037 = xor i32 %.pre1035, %226 + %.pre1039 = shl nuw nsw i32 %.pre1037, 3 + %.pre1041 = or disjoint i32 %.pre1039, %240 + %.pre1043 = or disjoint i32 %.pre1039, %229 + %.pre1045 = or disjoint i32 %231, %.pre1039 + %.pre1047 = or disjoint i32 %.pre1039, %255 + %.pre1049 = or disjoint i32 %247, %.pre1039 + %.pre1051 = or disjoint i32 %249, %.pre1039 + %.pre1053 = or disjoint i32 %.pre1039, %.pre + %.pre1055 = or disjoint i32 %.pre1053, 2048 + %.pre1057 = or disjoint i32 %.pre1039, %.pre1015 + %.pre1059 = or disjoint i32 %.pre1057, 2048 + %.pre1061 = or disjoint i32 %.pre1019, %.pre1039 + %.pre1063 = or disjoint i32 %.pre1023, %.pre1039 + %.pre1065 = or disjoint i32 %.pre1027, %.pre1039 + %.pre1067 = or disjoint i32 %.pre1031, %.pre1039 + %263 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %264 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %265 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %266 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %267 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %268 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %269 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %270 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %271 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %272 = shufflevector <8 x half> %108, <8 x half> poison, <2 
x i32> + %273 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %274 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %275 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %276 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %277 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %278 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %279 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %280 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %281 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %282 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %283 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %284 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %285 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %286 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %287 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %288 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %289 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %290 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %291 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %292 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %293 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + %294 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + %295 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %296 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %297 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %298 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %299 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %300 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %301 = shufflevector <8 x half> %236, <8 x half> poison, 
<2 x i32> + %302 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %303 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %304 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %305 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %306 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %307 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %308 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %309 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %310 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %311 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %312 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %313 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %314 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %315 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %316 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %317 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %318 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %319 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %320 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %321 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %322 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %323 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %324 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %325 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %326 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %327 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %328 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %329 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %330 = shufflevector <8 x half> %99, <8 x half> poison, <2 
x i32> + %331 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %332 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %333 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %334 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %335 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %336 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %337 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %338 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %339 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %340 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %341 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %342 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %343 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %344 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %345 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %346 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %347 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %348 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %349 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %350 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %351 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %352 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %353 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %354 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %355 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %356 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %357 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + %358 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + br label %._crit_edge + +.lr.ph: ; preds = %12 + %359 = 
lshr i32 %65, 6 + %invariant.op404 = or disjoint i32 %240, 6144 + %invariant.op402 = or disjoint i32 %240, 2048 + %invariant.op400 = or disjoint i32 %228, 2048 + %360 = or disjoint i32 %240, 8192 + %361 = or disjoint i32 %360, %228 + %.reass = or disjoint i32 %360, %invariant.op400 + %362 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %361 + %363 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass + %364 = or disjoint i32 %240, 12288 + %365 = or disjoint i32 %364, %228 + %.reass401 = or disjoint i32 %364, %invariant.op400 + %366 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %365 + %367 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass401 + %368 = or disjoint i32 %229, 8192 + %369 = or disjoint i32 %368, %228 + %370 = or disjoint i32 %229, 10240 + %371 = or disjoint i32 %370, %228 + %372 = getelementptr half, ptr addrspace(3) @global_smem, i32 %369 + %373 = getelementptr half, ptr addrspace(3) @global_smem, i32 %371 + %374 = or disjoint i32 %229, 12288 + %375 = or disjoint i32 %374, %228 + %376 = or disjoint i32 %229, 14336 + %377 = or disjoint i32 %376, %228 + %378 = getelementptr half, ptr addrspace(3) @global_smem, i32 %375 + %379 = getelementptr half, ptr addrspace(3) @global_smem, i32 %377 + %380 = or disjoint i32 %224, 4 + %381 = xor i32 %380, %226 + %382 = shl nuw nsw i32 %381, 3 + %383 = or disjoint i32 %382, %229 + %384 = or disjoint i32 %231, %382 + %385 = getelementptr half, ptr addrspace(3) @global_smem, i32 %383 + %386 = getelementptr half, ptr addrspace(3) @global_smem, i32 %384 + %387 = or disjoint i32 %382, %240 + %.reass403 = or disjoint i32 %382, %invariant.op402 + %388 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %387 + %389 = getelementptr half, ptr 
addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass403 + %390 = or disjoint i32 %382, %255 + %.reass405 = or disjoint i32 %382, %invariant.op404 + %391 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %390 + %392 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass405 + %393 = or disjoint i32 %247, %382 + %394 = or disjoint i32 %249, %382 + %395 = getelementptr half, ptr addrspace(3) @global_smem, i32 %393 + %396 = getelementptr half, ptr addrspace(3) @global_smem, i32 %394 + %397 = or disjoint i32 %382, %360 + %398 = or disjoint i32 %397, 2048 + %399 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %397 + %400 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %398 + %401 = or disjoint i32 %382, %364 + %402 = or disjoint i32 %401, 2048 + %403 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %401 + %404 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %402 + %405 = or disjoint i32 %368, %382 + %406 = or disjoint i32 %370, %382 + %407 = getelementptr half, ptr addrspace(3) @global_smem, i32 %405 + %408 = getelementptr half, ptr addrspace(3) @global_smem, i32 %406 + %409 = or disjoint i32 %374, %382 + %410 = or disjoint i32 %376, %382 + %411 = getelementptr half, ptr addrspace(3) @global_smem, i32 %409 + %412 = getelementptr half, ptr addrspace(3) @global_smem, i32 %410 + %413 = add nsw i32 %359, -3 + %414 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %415 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %416 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %417 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> + %418 = shufflevector <8 x 
half> %259, <8 x half> poison, <2 x i32> + %419 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %420 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %421 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> + %422 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %423 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %424 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %425 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> + %426 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %427 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %428 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %429 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> + %430 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %431 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %432 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %433 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> + %434 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %435 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %436 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %437 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> + %438 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %439 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %440 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %441 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> + %442 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %443 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %444 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %445 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> + %446 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + %447 = shufflevector <8 
x half> %188, <8 x half> poison, <2 x i32> + %448 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + %449 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> + %450 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %451 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %452 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %453 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> + %454 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %455 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %456 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %457 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> + %458 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %459 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %460 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %461 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> + %462 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %463 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %464 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %465 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> + %466 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %467 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %468 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %469 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> + %470 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %471 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %472 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %473 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> + %474 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %475 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %476 = shufflevector 
<8 x half> %167, <8 x half> poison, <2 x i32> + %477 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> + %478 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %479 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %480 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %481 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> + %482 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %483 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %484 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %485 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> + %486 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %487 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %488 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %489 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> + %490 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %491 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %492 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %493 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> + %494 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %495 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %496 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %497 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> + %498 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %499 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %500 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %501 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> + %502 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %503 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %504 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> + %505 = shufflevector 
<8 x half> %99, <8 x half> poison, <2 x i32> + %506 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %507 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %508 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + %509 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> + br label %510 + +510: ; preds = %.lr.ph, %510 + %511 = phi float [ 0.000000e+00, %.lr.ph ], [ %1824, %510 ] + %512 = phi float [ 0.000000e+00, %.lr.ph ], [ %1825, %510 ] + %513 = phi float [ 0.000000e+00, %.lr.ph ], [ %1826, %510 ] + %514 = phi float [ 0.000000e+00, %.lr.ph ], [ %1827, %510 ] + %515 = phi float [ 0.000000e+00, %.lr.ph ], [ %1830, %510 ] + %516 = phi float [ 0.000000e+00, %.lr.ph ], [ %1831, %510 ] + %517 = phi float [ 0.000000e+00, %.lr.ph ], [ %1832, %510 ] + %518 = phi float [ 0.000000e+00, %.lr.ph ], [ %1833, %510 ] + %519 = phi float [ 0.000000e+00, %.lr.ph ], [ %1836, %510 ] + %520 = phi float [ 0.000000e+00, %.lr.ph ], [ %1837, %510 ] + %521 = phi float [ 0.000000e+00, %.lr.ph ], [ %1838, %510 ] + %522 = phi float [ 0.000000e+00, %.lr.ph ], [ %1839, %510 ] + %523 = phi float [ 0.000000e+00, %.lr.ph ], [ %1842, %510 ] + %524 = phi float [ 0.000000e+00, %.lr.ph ], [ %1843, %510 ] + %525 = phi float [ 0.000000e+00, %.lr.ph ], [ %1844, %510 ] + %526 = phi float [ 0.000000e+00, %.lr.ph ], [ %1845, %510 ] + %527 = phi float [ 0.000000e+00, %.lr.ph ], [ %1800, %510 ] + %528 = phi float [ 0.000000e+00, %.lr.ph ], [ %1801, %510 ] + %529 = phi float [ 0.000000e+00, %.lr.ph ], [ %1802, %510 ] + %530 = phi float [ 0.000000e+00, %.lr.ph ], [ %1803, %510 ] + %531 = phi float [ 0.000000e+00, %.lr.ph ], [ %1806, %510 ] + %532 = phi float [ 0.000000e+00, %.lr.ph ], [ %1807, %510 ] + %533 = phi float [ 0.000000e+00, %.lr.ph ], [ %1808, %510 ] + %534 = phi float [ 0.000000e+00, %.lr.ph ], [ %1809, %510 ] + %535 = phi float [ 0.000000e+00, %.lr.ph ], [ %1812, %510 ] + %536 = phi float [ 0.000000e+00, %.lr.ph ], [ %1813, %510 ] + %537 = phi float [ 
0.000000e+00, %.lr.ph ], [ %1814, %510 ] + %538 = phi float [ 0.000000e+00, %.lr.ph ], [ %1815, %510 ] + %539 = phi float [ 0.000000e+00, %.lr.ph ], [ %1818, %510 ] + %540 = phi float [ 0.000000e+00, %.lr.ph ], [ %1819, %510 ] + %541 = phi float [ 0.000000e+00, %.lr.ph ], [ %1820, %510 ] + %542 = phi float [ 0.000000e+00, %.lr.ph ], [ %1821, %510 ] + %543 = phi float [ 0.000000e+00, %.lr.ph ], [ %1720, %510 ] + %544 = phi float [ 0.000000e+00, %.lr.ph ], [ %1721, %510 ] + %545 = phi float [ 0.000000e+00, %.lr.ph ], [ %1722, %510 ] + %546 = phi float [ 0.000000e+00, %.lr.ph ], [ %1723, %510 ] + %547 = phi float [ 0.000000e+00, %.lr.ph ], [ %1726, %510 ] + %548 = phi float [ 0.000000e+00, %.lr.ph ], [ %1727, %510 ] + %549 = phi float [ 0.000000e+00, %.lr.ph ], [ %1728, %510 ] + %550 = phi float [ 0.000000e+00, %.lr.ph ], [ %1729, %510 ] + %551 = phi float [ 0.000000e+00, %.lr.ph ], [ %1732, %510 ] + %552 = phi float [ 0.000000e+00, %.lr.ph ], [ %1733, %510 ] + %553 = phi float [ 0.000000e+00, %.lr.ph ], [ %1734, %510 ] + %554 = phi float [ 0.000000e+00, %.lr.ph ], [ %1735, %510 ] + %555 = phi float [ 0.000000e+00, %.lr.ph ], [ %1738, %510 ] + %556 = phi float [ 0.000000e+00, %.lr.ph ], [ %1739, %510 ] + %557 = phi float [ 0.000000e+00, %.lr.ph ], [ %1740, %510 ] + %558 = phi float [ 0.000000e+00, %.lr.ph ], [ %1741, %510 ] + %559 = phi float [ 0.000000e+00, %.lr.ph ], [ %1696, %510 ] + %560 = phi float [ 0.000000e+00, %.lr.ph ], [ %1697, %510 ] + %561 = phi float [ 0.000000e+00, %.lr.ph ], [ %1698, %510 ] + %562 = phi float [ 0.000000e+00, %.lr.ph ], [ %1699, %510 ] + %563 = phi float [ 0.000000e+00, %.lr.ph ], [ %1702, %510 ] + %564 = phi float [ 0.000000e+00, %.lr.ph ], [ %1703, %510 ] + %565 = phi float [ 0.000000e+00, %.lr.ph ], [ %1704, %510 ] + %566 = phi float [ 0.000000e+00, %.lr.ph ], [ %1705, %510 ] + %567 = phi float [ 0.000000e+00, %.lr.ph ], [ %1708, %510 ] + %568 = phi float [ 0.000000e+00, %.lr.ph ], [ %1709, %510 ] + %569 = phi float [ 0.000000e+00, 
%.lr.ph ], [ %1710, %510 ] + %570 = phi float [ 0.000000e+00, %.lr.ph ], [ %1711, %510 ] + %571 = phi float [ 0.000000e+00, %.lr.ph ], [ %1714, %510 ] + %572 = phi float [ 0.000000e+00, %.lr.ph ], [ %1715, %510 ] + %573 = phi float [ 0.000000e+00, %.lr.ph ], [ %1716, %510 ] + %574 = phi float [ 0.000000e+00, %.lr.ph ], [ %1717, %510 ] + %575 = phi float [ 0.000000e+00, %.lr.ph ], [ %1772, %510 ] + %576 = phi float [ 0.000000e+00, %.lr.ph ], [ %1773, %510 ] + %577 = phi float [ 0.000000e+00, %.lr.ph ], [ %1774, %510 ] + %578 = phi float [ 0.000000e+00, %.lr.ph ], [ %1775, %510 ] + %579 = phi float [ 0.000000e+00, %.lr.ph ], [ %1778, %510 ] + %580 = phi float [ 0.000000e+00, %.lr.ph ], [ %1779, %510 ] + %581 = phi float [ 0.000000e+00, %.lr.ph ], [ %1780, %510 ] + %582 = phi float [ 0.000000e+00, %.lr.ph ], [ %1781, %510 ] + %583 = phi float [ 0.000000e+00, %.lr.ph ], [ %1784, %510 ] + %584 = phi float [ 0.000000e+00, %.lr.ph ], [ %1785, %510 ] + %585 = phi float [ 0.000000e+00, %.lr.ph ], [ %1786, %510 ] + %586 = phi float [ 0.000000e+00, %.lr.ph ], [ %1787, %510 ] + %587 = phi float [ 0.000000e+00, %.lr.ph ], [ %1790, %510 ] + %588 = phi float [ 0.000000e+00, %.lr.ph ], [ %1791, %510 ] + %589 = phi float [ 0.000000e+00, %.lr.ph ], [ %1792, %510 ] + %590 = phi float [ 0.000000e+00, %.lr.ph ], [ %1793, %510 ] + %591 = phi float [ 0.000000e+00, %.lr.ph ], [ %1744, %510 ] + %592 = phi float [ 0.000000e+00, %.lr.ph ], [ %1745, %510 ] + %593 = phi float [ 0.000000e+00, %.lr.ph ], [ %1746, %510 ] + %594 = phi float [ 0.000000e+00, %.lr.ph ], [ %1747, %510 ] + %595 = phi float [ 0.000000e+00, %.lr.ph ], [ %1750, %510 ] + %596 = phi float [ 0.000000e+00, %.lr.ph ], [ %1751, %510 ] + %597 = phi float [ 0.000000e+00, %.lr.ph ], [ %1752, %510 ] + %598 = phi float [ 0.000000e+00, %.lr.ph ], [ %1753, %510 ] + %599 = phi float [ 0.000000e+00, %.lr.ph ], [ %1756, %510 ] + %600 = phi float [ 0.000000e+00, %.lr.ph ], [ %1757, %510 ] + %601 = phi float [ 0.000000e+00, %.lr.ph ], [ 
%1758, %510 ] + %602 = phi float [ 0.000000e+00, %.lr.ph ], [ %1759, %510 ] + %603 = phi float [ 0.000000e+00, %.lr.ph ], [ %1762, %510 ] + %604 = phi float [ 0.000000e+00, %.lr.ph ], [ %1763, %510 ] + %605 = phi float [ 0.000000e+00, %.lr.ph ], [ %1764, %510 ] + %606 = phi float [ 0.000000e+00, %.lr.ph ], [ %1765, %510 ] + %607 = phi float [ 0.000000e+00, %.lr.ph ], [ %1668, %510 ] + %608 = phi float [ 0.000000e+00, %.lr.ph ], [ %1669, %510 ] + %609 = phi float [ 0.000000e+00, %.lr.ph ], [ %1670, %510 ] + %610 = phi float [ 0.000000e+00, %.lr.ph ], [ %1671, %510 ] + %611 = phi float [ 0.000000e+00, %.lr.ph ], [ %1674, %510 ] + %612 = phi float [ 0.000000e+00, %.lr.ph ], [ %1675, %510 ] + %613 = phi float [ 0.000000e+00, %.lr.ph ], [ %1676, %510 ] + %614 = phi float [ 0.000000e+00, %.lr.ph ], [ %1677, %510 ] + %615 = phi float [ 0.000000e+00, %.lr.ph ], [ %1680, %510 ] + %616 = phi float [ 0.000000e+00, %.lr.ph ], [ %1681, %510 ] + %617 = phi float [ 0.000000e+00, %.lr.ph ], [ %1682, %510 ] + %618 = phi float [ 0.000000e+00, %.lr.ph ], [ %1683, %510 ] + %619 = phi float [ 0.000000e+00, %.lr.ph ], [ %1686, %510 ] + %620 = phi float [ 0.000000e+00, %.lr.ph ], [ %1687, %510 ] + %621 = phi float [ 0.000000e+00, %.lr.ph ], [ %1688, %510 ] + %622 = phi float [ 0.000000e+00, %.lr.ph ], [ %1689, %510 ] + %623 = phi float [ 0.000000e+00, %.lr.ph ], [ %1644, %510 ] + %624 = phi float [ 0.000000e+00, %.lr.ph ], [ %1645, %510 ] + %625 = phi float [ 0.000000e+00, %.lr.ph ], [ %1646, %510 ] + %626 = phi float [ 0.000000e+00, %.lr.ph ], [ %1647, %510 ] + %627 = phi float [ 0.000000e+00, %.lr.ph ], [ %1650, %510 ] + %628 = phi float [ 0.000000e+00, %.lr.ph ], [ %1651, %510 ] + %629 = phi float [ 0.000000e+00, %.lr.ph ], [ %1652, %510 ] + %630 = phi float [ 0.000000e+00, %.lr.ph ], [ %1653, %510 ] + %631 = phi float [ 0.000000e+00, %.lr.ph ], [ %1656, %510 ] + %632 = phi float [ 0.000000e+00, %.lr.ph ], [ %1657, %510 ] + %633 = phi float [ 0.000000e+00, %.lr.ph ], [ %1658, %510 ] + 
%634 = phi float [ 0.000000e+00, %.lr.ph ], [ %1659, %510 ] + %635 = phi float [ 0.000000e+00, %.lr.ph ], [ %1662, %510 ] + %636 = phi float [ 0.000000e+00, %.lr.ph ], [ %1663, %510 ] + %637 = phi float [ 0.000000e+00, %.lr.ph ], [ %1664, %510 ] + %638 = phi float [ 0.000000e+00, %.lr.ph ], [ %1665, %510 ] + %639 = phi float [ 0.000000e+00, %.lr.ph ], [ %1558, %510 ] + %640 = phi float [ 0.000000e+00, %.lr.ph ], [ %1559, %510 ] + %641 = phi float [ 0.000000e+00, %.lr.ph ], [ %1560, %510 ] + %642 = phi float [ 0.000000e+00, %.lr.ph ], [ %1561, %510 ] + %643 = phi float [ 0.000000e+00, %.lr.ph ], [ %1564, %510 ] + %644 = phi float [ 0.000000e+00, %.lr.ph ], [ %1565, %510 ] + %645 = phi float [ 0.000000e+00, %.lr.ph ], [ %1566, %510 ] + %646 = phi float [ 0.000000e+00, %.lr.ph ], [ %1567, %510 ] + %647 = phi float [ 0.000000e+00, %.lr.ph ], [ %1570, %510 ] + %648 = phi float [ 0.000000e+00, %.lr.ph ], [ %1571, %510 ] + %649 = phi float [ 0.000000e+00, %.lr.ph ], [ %1572, %510 ] + %650 = phi float [ 0.000000e+00, %.lr.ph ], [ %1573, %510 ] + %651 = phi float [ 0.000000e+00, %.lr.ph ], [ %1576, %510 ] + %652 = phi float [ 0.000000e+00, %.lr.ph ], [ %1577, %510 ] + %653 = phi float [ 0.000000e+00, %.lr.ph ], [ %1578, %510 ] + %654 = phi float [ 0.000000e+00, %.lr.ph ], [ %1579, %510 ] + %655 = phi float [ 0.000000e+00, %.lr.ph ], [ %1534, %510 ] + %656 = phi float [ 0.000000e+00, %.lr.ph ], [ %1535, %510 ] + %657 = phi float [ 0.000000e+00, %.lr.ph ], [ %1536, %510 ] + %658 = phi float [ 0.000000e+00, %.lr.ph ], [ %1537, %510 ] + %659 = phi float [ 0.000000e+00, %.lr.ph ], [ %1540, %510 ] + %660 = phi float [ 0.000000e+00, %.lr.ph ], [ %1541, %510 ] + %661 = phi float [ 0.000000e+00, %.lr.ph ], [ %1542, %510 ] + %662 = phi float [ 0.000000e+00, %.lr.ph ], [ %1543, %510 ] + %663 = phi float [ 0.000000e+00, %.lr.ph ], [ %1546, %510 ] + %664 = phi float [ 0.000000e+00, %.lr.ph ], [ %1547, %510 ] + %665 = phi float [ 0.000000e+00, %.lr.ph ], [ %1548, %510 ] + %666 = phi 
float [ 0.000000e+00, %.lr.ph ], [ %1549, %510 ] + %667 = phi float [ 0.000000e+00, %.lr.ph ], [ %1552, %510 ] + %668 = phi float [ 0.000000e+00, %.lr.ph ], [ %1553, %510 ] + %669 = phi float [ 0.000000e+00, %.lr.ph ], [ %1554, %510 ] + %670 = phi float [ 0.000000e+00, %.lr.ph ], [ %1555, %510 ] + %671 = phi float [ 0.000000e+00, %.lr.ph ], [ %1396, %510 ] + %672 = phi float [ 0.000000e+00, %.lr.ph ], [ %1397, %510 ] + %673 = phi float [ 0.000000e+00, %.lr.ph ], [ %1398, %510 ] + %674 = phi float [ 0.000000e+00, %.lr.ph ], [ %1399, %510 ] + %675 = phi float [ 0.000000e+00, %.lr.ph ], [ %1402, %510 ] + %676 = phi float [ 0.000000e+00, %.lr.ph ], [ %1403, %510 ] + %677 = phi float [ 0.000000e+00, %.lr.ph ], [ %1404, %510 ] + %678 = phi float [ 0.000000e+00, %.lr.ph ], [ %1405, %510 ] + %679 = phi float [ 0.000000e+00, %.lr.ph ], [ %1408, %510 ] + %680 = phi float [ 0.000000e+00, %.lr.ph ], [ %1409, %510 ] + %681 = phi float [ 0.000000e+00, %.lr.ph ], [ %1410, %510 ] + %682 = phi float [ 0.000000e+00, %.lr.ph ], [ %1411, %510 ] + %683 = phi float [ 0.000000e+00, %.lr.ph ], [ %1414, %510 ] + %684 = phi float [ 0.000000e+00, %.lr.ph ], [ %1415, %510 ] + %685 = phi float [ 0.000000e+00, %.lr.ph ], [ %1416, %510 ] + %686 = phi float [ 0.000000e+00, %.lr.ph ], [ %1417, %510 ] + %687 = phi float [ 0.000000e+00, %.lr.ph ], [ %1372, %510 ] + %688 = phi float [ 0.000000e+00, %.lr.ph ], [ %1373, %510 ] + %689 = phi float [ 0.000000e+00, %.lr.ph ], [ %1374, %510 ] + %690 = phi float [ 0.000000e+00, %.lr.ph ], [ %1375, %510 ] + %691 = phi float [ 0.000000e+00, %.lr.ph ], [ %1378, %510 ] + %692 = phi float [ 0.000000e+00, %.lr.ph ], [ %1379, %510 ] + %693 = phi float [ 0.000000e+00, %.lr.ph ], [ %1380, %510 ] + %694 = phi float [ 0.000000e+00, %.lr.ph ], [ %1381, %510 ] + %695 = phi float [ 0.000000e+00, %.lr.ph ], [ %1384, %510 ] + %696 = phi float [ 0.000000e+00, %.lr.ph ], [ %1385, %510 ] + %697 = phi float [ 0.000000e+00, %.lr.ph ], [ %1386, %510 ] + %698 = phi float [ 
0.000000e+00, %.lr.ph ], [ %1387, %510 ] + %699 = phi float [ 0.000000e+00, %.lr.ph ], [ %1390, %510 ] + %700 = phi float [ 0.000000e+00, %.lr.ph ], [ %1391, %510 ] + %701 = phi float [ 0.000000e+00, %.lr.ph ], [ %1392, %510 ] + %702 = phi float [ 0.000000e+00, %.lr.ph ], [ %1393, %510 ] + %703 = phi float [ 0.000000e+00, %.lr.ph ], [ %1510, %510 ] + %704 = phi float [ 0.000000e+00, %.lr.ph ], [ %1511, %510 ] + %705 = phi float [ 0.000000e+00, %.lr.ph ], [ %1512, %510 ] + %706 = phi float [ 0.000000e+00, %.lr.ph ], [ %1513, %510 ] + %707 = phi float [ 0.000000e+00, %.lr.ph ], [ %1516, %510 ] + %708 = phi float [ 0.000000e+00, %.lr.ph ], [ %1517, %510 ] + %709 = phi float [ 0.000000e+00, %.lr.ph ], [ %1518, %510 ] + %710 = phi float [ 0.000000e+00, %.lr.ph ], [ %1519, %510 ] + %711 = phi float [ 0.000000e+00, %.lr.ph ], [ %1522, %510 ] + %712 = phi float [ 0.000000e+00, %.lr.ph ], [ %1523, %510 ] + %713 = phi float [ 0.000000e+00, %.lr.ph ], [ %1524, %510 ] + %714 = phi float [ 0.000000e+00, %.lr.ph ], [ %1525, %510 ] + %715 = phi float [ 0.000000e+00, %.lr.ph ], [ %1528, %510 ] + %716 = phi float [ 0.000000e+00, %.lr.ph ], [ %1529, %510 ] + %717 = phi float [ 0.000000e+00, %.lr.ph ], [ %1530, %510 ] + %718 = phi float [ 0.000000e+00, %.lr.ph ], [ %1531, %510 ] + %719 = phi float [ 0.000000e+00, %.lr.ph ], [ %1482, %510 ] + %720 = phi float [ 0.000000e+00, %.lr.ph ], [ %1483, %510 ] + %721 = phi float [ 0.000000e+00, %.lr.ph ], [ %1484, %510 ] + %722 = phi float [ 0.000000e+00, %.lr.ph ], [ %1485, %510 ] + %723 = phi float [ 0.000000e+00, %.lr.ph ], [ %1488, %510 ] + %724 = phi float [ 0.000000e+00, %.lr.ph ], [ %1489, %510 ] + %725 = phi float [ 0.000000e+00, %.lr.ph ], [ %1490, %510 ] + %726 = phi float [ 0.000000e+00, %.lr.ph ], [ %1491, %510 ] + %727 = phi float [ 0.000000e+00, %.lr.ph ], [ %1494, %510 ] + %728 = phi float [ 0.000000e+00, %.lr.ph ], [ %1495, %510 ] + %729 = phi float [ 0.000000e+00, %.lr.ph ], [ %1496, %510 ] + %730 = phi float [ 0.000000e+00, 
%.lr.ph ], [ %1497, %510 ] + %731 = phi float [ 0.000000e+00, %.lr.ph ], [ %1500, %510 ] + %732 = phi float [ 0.000000e+00, %.lr.ph ], [ %1501, %510 ] + %733 = phi float [ 0.000000e+00, %.lr.ph ], [ %1502, %510 ] + %734 = phi float [ 0.000000e+00, %.lr.ph ], [ %1503, %510 ] + %735 = phi float [ 0.000000e+00, %.lr.ph ], [ %1340, %510 ] + %736 = phi float [ 0.000000e+00, %.lr.ph ], [ %1341, %510 ] + %737 = phi float [ 0.000000e+00, %.lr.ph ], [ %1342, %510 ] + %738 = phi float [ 0.000000e+00, %.lr.ph ], [ %1343, %510 ] + %739 = phi float [ 0.000000e+00, %.lr.ph ], [ %1346, %510 ] + %740 = phi float [ 0.000000e+00, %.lr.ph ], [ %1347, %510 ] + %741 = phi float [ 0.000000e+00, %.lr.ph ], [ %1348, %510 ] + %742 = phi float [ 0.000000e+00, %.lr.ph ], [ %1349, %510 ] + %743 = phi float [ 0.000000e+00, %.lr.ph ], [ %1352, %510 ] + %744 = phi float [ 0.000000e+00, %.lr.ph ], [ %1353, %510 ] + %745 = phi float [ 0.000000e+00, %.lr.ph ], [ %1354, %510 ] + %746 = phi float [ 0.000000e+00, %.lr.ph ], [ %1355, %510 ] + %747 = phi float [ 0.000000e+00, %.lr.ph ], [ %1358, %510 ] + %748 = phi float [ 0.000000e+00, %.lr.ph ], [ %1359, %510 ] + %749 = phi float [ 0.000000e+00, %.lr.ph ], [ %1360, %510 ] + %750 = phi float [ 0.000000e+00, %.lr.ph ], [ %1361, %510 ] + %751 = phi ptr addrspace(1) [ %138, %.lr.ph ], [ %1620, %510 ] + %752 = phi ptr addrspace(1) [ %64, %.lr.ph ], [ %1458, %510 ] + %753 = phi float [ 0.000000e+00, %.lr.ph ], [ %1308, %510 ] + %754 = phi float [ 0.000000e+00, %.lr.ph ], [ %1309, %510 ] + %755 = phi float [ 0.000000e+00, %.lr.ph ], [ %1310, %510 ] + %756 = phi float [ 0.000000e+00, %.lr.ph ], [ %1311, %510 ] + %757 = phi float [ 0.000000e+00, %.lr.ph ], [ %1314, %510 ] + %758 = phi float [ 0.000000e+00, %.lr.ph ], [ %1315, %510 ] + %759 = phi float [ 0.000000e+00, %.lr.ph ], [ %1316, %510 ] + %760 = phi float [ 0.000000e+00, %.lr.ph ], [ %1317, %510 ] + %761 = phi float [ 0.000000e+00, %.lr.ph ], [ %1320, %510 ] + %762 = phi float [ 0.000000e+00, %.lr.ph ], 
[ %1321, %510 ] + %763 = phi float [ 0.000000e+00, %.lr.ph ], [ %1322, %510 ] + %764 = phi float [ 0.000000e+00, %.lr.ph ], [ %1323, %510 ] + %765 = phi float [ 0.000000e+00, %.lr.ph ], [ %1326, %510 ] + %766 = phi float [ 0.000000e+00, %.lr.ph ], [ %1327, %510 ] + %767 = phi float [ 0.000000e+00, %.lr.ph ], [ %1328, %510 ] + %768 = phi float [ 0.000000e+00, %.lr.ph ], [ %1329, %510 ] + %769 = phi i32 [ 0, %.lr.ph ], [ %1846, %510 ] + %770 = phi <2 x half> [ %414, %.lr.ph ], [ %1910, %510 ] + %771 = phi <2 x half> [ %415, %.lr.ph ], [ %1909, %510 ] + %772 = phi <2 x half> [ %416, %.lr.ph ], [ %1908, %510 ] + %773 = phi <2 x half> [ %417, %.lr.ph ], [ %1907, %510 ] + %774 = phi <2 x half> [ %418, %.lr.ph ], [ %1906, %510 ] + %775 = phi <2 x half> [ %419, %.lr.ph ], [ %1905, %510 ] + %776 = phi <2 x half> [ %420, %.lr.ph ], [ %1904, %510 ] + %777 = phi <2 x half> [ %421, %.lr.ph ], [ %1903, %510 ] + %778 = phi <2 x half> [ %422, %.lr.ph ], [ %1902, %510 ] + %779 = phi <2 x half> [ %423, %.lr.ph ], [ %1901, %510 ] + %780 = phi <2 x half> [ %424, %.lr.ph ], [ %1900, %510 ] + %781 = phi <2 x half> [ %425, %.lr.ph ], [ %1899, %510 ] + %782 = phi <2 x half> [ %426, %.lr.ph ], [ %1898, %510 ] + %783 = phi <2 x half> [ %427, %.lr.ph ], [ %1897, %510 ] + %784 = phi <2 x half> [ %428, %.lr.ph ], [ %1896, %510 ] + %785 = phi <2 x half> [ %429, %.lr.ph ], [ %1895, %510 ] + %786 = phi <2 x half> [ %430, %.lr.ph ], [ %1894, %510 ] + %787 = phi <2 x half> [ %431, %.lr.ph ], [ %1893, %510 ] + %788 = phi <2 x half> [ %432, %.lr.ph ], [ %1892, %510 ] + %789 = phi <2 x half> [ %433, %.lr.ph ], [ %1891, %510 ] + %790 = phi <2 x half> [ %434, %.lr.ph ], [ %1890, %510 ] + %791 = phi <2 x half> [ %435, %.lr.ph ], [ %1889, %510 ] + %792 = phi <2 x half> [ %436, %.lr.ph ], [ %1888, %510 ] + %793 = phi <2 x half> [ %437, %.lr.ph ], [ %1887, %510 ] + %794 = phi <2 x half> [ %438, %.lr.ph ], [ %1886, %510 ] + %795 = phi <2 x half> [ %439, %.lr.ph ], [ %1885, %510 ] + %796 = phi <2 x half> [ 
%440, %.lr.ph ], [ %1884, %510 ] + %797 = phi <2 x half> [ %441, %.lr.ph ], [ %1883, %510 ] + %798 = phi <2 x half> [ %442, %.lr.ph ], [ %1882, %510 ] + %799 = phi <2 x half> [ %443, %.lr.ph ], [ %1881, %510 ] + %800 = phi <2 x half> [ %444, %.lr.ph ], [ %1880, %510 ] + %801 = phi <2 x half> [ %445, %.lr.ph ], [ %1879, %510 ] + %802 = phi <2 x half> [ %446, %.lr.ph ], [ %1878, %510 ] + %803 = phi <2 x half> [ %447, %.lr.ph ], [ %1942, %510 ] + %804 = phi <2 x half> [ %448, %.lr.ph ], [ %1941, %510 ] + %805 = phi <2 x half> [ %449, %.lr.ph ], [ %1877, %510 ] + %806 = phi <2 x half> [ %450, %.lr.ph ], [ %1876, %510 ] + %807 = phi <2 x half> [ %451, %.lr.ph ], [ %1940, %510 ] + %808 = phi <2 x half> [ %452, %.lr.ph ], [ %1939, %510 ] + %809 = phi <2 x half> [ %453, %.lr.ph ], [ %1875, %510 ] + %810 = phi <2 x half> [ %454, %.lr.ph ], [ %1874, %510 ] + %811 = phi <2 x half> [ %455, %.lr.ph ], [ %1938, %510 ] + %812 = phi <2 x half> [ %456, %.lr.ph ], [ %1937, %510 ] + %813 = phi <2 x half> [ %457, %.lr.ph ], [ %1873, %510 ] + %814 = phi <2 x half> [ %458, %.lr.ph ], [ %1872, %510 ] + %815 = phi <2 x half> [ %459, %.lr.ph ], [ %1936, %510 ] + %816 = phi <2 x half> [ %460, %.lr.ph ], [ %1935, %510 ] + %817 = phi <2 x half> [ %461, %.lr.ph ], [ %1871, %510 ] + %818 = phi <2 x half> [ %462, %.lr.ph ], [ %1870, %510 ] + %819 = phi <2 x half> [ %463, %.lr.ph ], [ %1934, %510 ] + %820 = phi <2 x half> [ %464, %.lr.ph ], [ %1933, %510 ] + %821 = phi <2 x half> [ %465, %.lr.ph ], [ %1869, %510 ] + %822 = phi <2 x half> [ %466, %.lr.ph ], [ %1868, %510 ] + %823 = phi <2 x half> [ %467, %.lr.ph ], [ %1932, %510 ] + %824 = phi <2 x half> [ %468, %.lr.ph ], [ %1931, %510 ] + %825 = phi <2 x half> [ %469, %.lr.ph ], [ %1867, %510 ] + %826 = phi <2 x half> [ %470, %.lr.ph ], [ %1866, %510 ] + %827 = phi <2 x half> [ %471, %.lr.ph ], [ %1930, %510 ] + %828 = phi <2 x half> [ %472, %.lr.ph ], [ %1929, %510 ] + %829 = phi <2 x half> [ %473, %.lr.ph ], [ %1865, %510 ] + %830 = phi <2 x 
half> [ %474, %.lr.ph ], [ %1864, %510 ] + %831 = phi <2 x half> [ %475, %.lr.ph ], [ %1928, %510 ] + %832 = phi <2 x half> [ %476, %.lr.ph ], [ %1927, %510 ] + %833 = phi <2 x half> [ %477, %.lr.ph ], [ %1863, %510 ] + %834 = phi <2 x half> [ %478, %.lr.ph ], [ %1862, %510 ] + %835 = phi <2 x half> [ %479, %.lr.ph ], [ %1926, %510 ] + %836 = phi <2 x half> [ %480, %.lr.ph ], [ %1925, %510 ] + %837 = phi <2 x half> [ %481, %.lr.ph ], [ %1861, %510 ] + %838 = phi <2 x half> [ %482, %.lr.ph ], [ %1860, %510 ] + %839 = phi <2 x half> [ %483, %.lr.ph ], [ %1924, %510 ] + %840 = phi <2 x half> [ %484, %.lr.ph ], [ %1923, %510 ] + %841 = phi <2 x half> [ %485, %.lr.ph ], [ %1859, %510 ] + %842 = phi <2 x half> [ %486, %.lr.ph ], [ %1858, %510 ] + %843 = phi <2 x half> [ %487, %.lr.ph ], [ %1922, %510 ] + %844 = phi <2 x half> [ %488, %.lr.ph ], [ %1921, %510 ] + %845 = phi <2 x half> [ %489, %.lr.ph ], [ %1857, %510 ] + %846 = phi <2 x half> [ %490, %.lr.ph ], [ %1856, %510 ] + %847 = phi <2 x half> [ %491, %.lr.ph ], [ %1920, %510 ] + %848 = phi <2 x half> [ %492, %.lr.ph ], [ %1919, %510 ] + %849 = phi <2 x half> [ %493, %.lr.ph ], [ %1855, %510 ] + %850 = phi <2 x half> [ %494, %.lr.ph ], [ %1854, %510 ] + %851 = phi <2 x half> [ %495, %.lr.ph ], [ %1918, %510 ] + %852 = phi <2 x half> [ %496, %.lr.ph ], [ %1917, %510 ] + %853 = phi <2 x half> [ %497, %.lr.ph ], [ %1853, %510 ] + %854 = phi <2 x half> [ %498, %.lr.ph ], [ %1852, %510 ] + %855 = phi <2 x half> [ %499, %.lr.ph ], [ %1916, %510 ] + %856 = phi <2 x half> [ %500, %.lr.ph ], [ %1915, %510 ] + %857 = phi <2 x half> [ %501, %.lr.ph ], [ %1851, %510 ] + %858 = phi <2 x half> [ %502, %.lr.ph ], [ %1850, %510 ] + %859 = phi <2 x half> [ %503, %.lr.ph ], [ %1914, %510 ] + %860 = phi <2 x half> [ %504, %.lr.ph ], [ %1913, %510 ] + %861 = phi <2 x half> [ %505, %.lr.ph ], [ %1849, %510 ] + %862 = phi <2 x half> [ %506, %.lr.ph ], [ %1848, %510 ] + %863 = phi <2 x half> [ %507, %.lr.ph ], [ %1912, %510 ] + %864 = 
phi <2 x half> [ %508, %.lr.ph ], [ %1911, %510 ] + %865 = phi <2 x half> [ %509, %.lr.ph ], [ %1847, %510 ] + %866 = shufflevector <2 x half> %801, <2 x half> %800, <4 x i32> + %867 = shufflevector <2 x half> %799, <2 x half> %798, <4 x i32> + %868 = shufflevector <2 x half> %797, <2 x half> %796, <4 x i32> + %869 = shufflevector <2 x half> %795, <2 x half> %794, <4 x i32> + %870 = shufflevector <2 x half> %793, <2 x half> %792, <4 x i32> + %871 = shufflevector <2 x half> %791, <2 x half> %790, <4 x i32> + %872 = shufflevector <2 x half> %789, <2 x half> %788, <4 x i32> + %873 = shufflevector <2 x half> %787, <2 x half> %786, <4 x i32> + %874 = insertelement <4 x float> poison, float %753, i64 0 + %875 = insertelement <4 x float> %874, float %754, i64 1 + %876 = insertelement <4 x float> %875, float %755, i64 2 + %877 = insertelement <4 x float> %876, float %756, i64 3 + %878 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %866, <4 x float> %877, i32 0, i32 0, i32 0) + %879 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %867, <4 x float> %878, i32 0, i32 0, i32 0) + %880 = insertelement <4 x float> poison, float %757, i64 0 + %881 = insertelement <4 x float> %880, float %758, i64 1 + %882 = insertelement <4 x float> %881, float %759, i64 2 + %883 = insertelement <4 x float> %882, float %760, i64 3 + %884 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %866, <4 x float> %883, i32 0, i32 0, i32 0) + %885 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %867, <4 x float> %884, i32 0, i32 0, i32 0) + %886 = insertelement <4 x float> poison, float %761, i64 0 + %887 = insertelement <4 x float> %886, float %762, i64 1 + %888 = insertelement <4 x float> %887, float %763, i64 2 + %889 = insertelement <4 x float> %888, float %764, i64 3 + %890 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x 
half> %868, <4 x float> %889, i32 0, i32 0, i32 0) + %891 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %869, <4 x float> %890, i32 0, i32 0, i32 0) + %892 = insertelement <4 x float> poison, float %765, i64 0 + %893 = insertelement <4 x float> %892, float %766, i64 1 + %894 = insertelement <4 x float> %893, float %767, i64 2 + %895 = insertelement <4 x float> %894, float %768, i64 3 + %896 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %868, <4 x float> %895, i32 0, i32 0, i32 0) + %897 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %869, <4 x float> %896, i32 0, i32 0, i32 0) + %898 = shufflevector <2 x half> %777, <2 x half> %776, <4 x i32> + %899 = shufflevector <2 x half> %775, <2 x half> %774, <4 x i32> + %900 = shufflevector <2 x half> %773, <2 x half> %772, <4 x i32> + %901 = shufflevector <2 x half> %771, <2 x half> %770, <4 x i32> + %902 = insertelement <4 x float> poison, float %735, i64 0 + %903 = insertelement <4 x float> %902, float %736, i64 1 + %904 = insertelement <4 x float> %903, float %737, i64 2 + %905 = insertelement <4 x float> %904, float %738, i64 3 + %906 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %866, <4 x float> %905, i32 0, i32 0, i32 0) + %907 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %867, <4 x float> %906, i32 0, i32 0, i32 0) + %908 = insertelement <4 x float> poison, float %739, i64 0 + %909 = insertelement <4 x float> %908, float %740, i64 1 + %910 = insertelement <4 x float> %909, float %741, i64 2 + %911 = insertelement <4 x float> %910, float %742, i64 3 + %912 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %866, <4 x float> %911, i32 0, i32 0, i32 0) + %913 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %867, <4 x float> %912, i32 0, i32 0, i32 0) 
+ %914 = insertelement <4 x float> poison, float %743, i64 0 + %915 = insertelement <4 x float> %914, float %744, i64 1 + %916 = insertelement <4 x float> %915, float %745, i64 2 + %917 = insertelement <4 x float> %916, float %746, i64 3 + %918 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %868, <4 x float> %917, i32 0, i32 0, i32 0) + %919 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %869, <4 x float> %918, i32 0, i32 0, i32 0) + %920 = insertelement <4 x float> poison, float %747, i64 0 + %921 = insertelement <4 x float> %920, float %748, i64 1 + %922 = insertelement <4 x float> %921, float %749, i64 2 + %923 = insertelement <4 x float> %922, float %750, i64 3 + %924 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %868, <4 x float> %923, i32 0, i32 0, i32 0) + %925 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %869, <4 x float> %924, i32 0, i32 0, i32 0) + %926 = load <8 x half>, ptr addrspace(3) %362, align 16 + %927 = load <8 x half>, ptr addrspace(3) %363, align 16 + %928 = load <8 x half>, ptr addrspace(3) %366, align 16 + %929 = load <8 x half>, ptr addrspace(3) %367, align 16 + %930 = shufflevector <2 x half> %785, <2 x half> %784, <4 x i32> + %931 = shufflevector <2 x half> %783, <2 x half> %782, <4 x i32> + %932 = shufflevector <2 x half> %781, <2 x half> %780, <4 x i32> + %933 = shufflevector <2 x half> %779, <2 x half> %778, <4 x i32> + %934 = insertelement <4 x float> poison, float %687, i64 0 + %935 = insertelement <4 x float> %934, float %688, i64 1 + %936 = insertelement <4 x float> %935, float %689, i64 2 + %937 = insertelement <4 x float> %936, float %690, i64 3 + %938 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %930, <4 x float> %937, i32 0, i32 0, i32 0) + %939 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> 
%931, <4 x float> %938, i32 0, i32 0, i32 0) + %940 = insertelement <4 x float> poison, float %691, i64 0 + %941 = insertelement <4 x float> %940, float %692, i64 1 + %942 = insertelement <4 x float> %941, float %693, i64 2 + %943 = insertelement <4 x float> %942, float %694, i64 3 + %944 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %930, <4 x float> %943, i32 0, i32 0, i32 0) + %945 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %931, <4 x float> %944, i32 0, i32 0, i32 0) + %946 = insertelement <4 x float> poison, float %695, i64 0 + %947 = insertelement <4 x float> %946, float %696, i64 1 + %948 = insertelement <4 x float> %947, float %697, i64 2 + %949 = insertelement <4 x float> %948, float %698, i64 3 + %950 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %932, <4 x float> %949, i32 0, i32 0, i32 0) + %951 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %933, <4 x float> %950, i32 0, i32 0, i32 0) + %952 = insertelement <4 x float> poison, float %699, i64 0 + %953 = insertelement <4 x float> %952, float %700, i64 1 + %954 = insertelement <4 x float> %953, float %701, i64 2 + %955 = insertelement <4 x float> %954, float %702, i64 3 + %956 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %932, <4 x float> %955, i32 0, i32 0, i32 0) + %957 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %933, <4 x float> %956, i32 0, i32 0, i32 0) + %958 = insertelement <4 x float> poison, float %671, i64 0 + %959 = insertelement <4 x float> %958, float %672, i64 1 + %960 = insertelement <4 x float> %959, float %673, i64 2 + %961 = insertelement <4 x float> %960, float %674, i64 3 + %962 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %930, <4 x float> %961, i32 0, i32 0, i32 0) + %963 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %931, <4 x float> %962, i32 0, i32 0, i32 0) + %964 = insertelement <4 x float> poison, float %675, i64 0 + %965 = insertelement <4 x float> %964, float %676, i64 1 + %966 = insertelement <4 x float> %965, float %677, i64 2 + %967 = insertelement <4 x float> %966, float %678, i64 3 + %968 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %930, <4 x float> %967, i32 0, i32 0, i32 0) + %969 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %931, <4 x float> %968, i32 0, i32 0, i32 0) + %970 = insertelement <4 x float> poison, float %679, i64 0 + %971 = insertelement <4 x float> %970, float %680, i64 1 + %972 = insertelement <4 x float> %971, float %681, i64 2 + %973 = insertelement <4 x float> %972, float %682, i64 3 + %974 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %932, <4 x float> %973, i32 0, i32 0, i32 0) + %975 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %933, <4 x float> %974, i32 0, i32 0, i32 0) + %976 = insertelement <4 x float> poison, float %683, i64 0 + %977 = insertelement <4 x float> %976, float %684, i64 1 + %978 = insertelement <4 x float> %977, float %685, i64 2 + %979 = insertelement <4 x float> %978, float %686, i64 3 + %980 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %932, <4 x float> %979, i32 0, i32 0, i32 0) + %981 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %933, <4 x float> %980, i32 0, i32 0, i32 0) + %982 = shufflevector <8 x half> %926, <8 x half> poison, <4 x i32> + %983 = shufflevector <8 x half> %926, <8 x half> poison, <4 x i32> + %984 = shufflevector <8 x half> %927, <8 x half> poison, <4 x i32> + %985 = shufflevector <8 x half> %927, <8 x half> poison, <4 x i32> + %986 = insertelement <4 x float> poison, float %719, i64 0 + %987 = 
insertelement <4 x float> %986, float %720, i64 1 + %988 = insertelement <4 x float> %987, float %721, i64 2 + %989 = insertelement <4 x float> %988, float %722, i64 3 + %990 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %866, <4 x float> %989, i32 0, i32 0, i32 0) + %991 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %867, <4 x float> %990, i32 0, i32 0, i32 0) + %992 = insertelement <4 x float> poison, float %723, i64 0 + %993 = insertelement <4 x float> %992, float %724, i64 1 + %994 = insertelement <4 x float> %993, float %725, i64 2 + %995 = insertelement <4 x float> %994, float %726, i64 3 + %996 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %866, <4 x float> %995, i32 0, i32 0, i32 0) + %997 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %867, <4 x float> %996, i32 0, i32 0, i32 0) + %998 = insertelement <4 x float> poison, float %727, i64 0 + %999 = insertelement <4 x float> %998, float %728, i64 1 + %1000 = insertelement <4 x float> %999, float %729, i64 2 + %1001 = insertelement <4 x float> %1000, float %730, i64 3 + %1002 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %868, <4 x float> %1001, i32 0, i32 0, i32 0) + %1003 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %869, <4 x float> %1002, i32 0, i32 0, i32 0) + %1004 = insertelement <4 x float> poison, float %731, i64 0 + %1005 = insertelement <4 x float> %1004, float %732, i64 1 + %1006 = insertelement <4 x float> %1005, float %733, i64 2 + %1007 = insertelement <4 x float> %1006, float %734, i64 3 + %1008 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %868, <4 x float> %1007, i32 0, i32 0, i32 0) + %1009 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %869, <4 x float> %1008, i32 0, i32 0, 
i32 0) + %1010 = shufflevector <8 x half> %928, <8 x half> poison, <4 x i32> + %1011 = shufflevector <8 x half> %928, <8 x half> poison, <4 x i32> + %1012 = shufflevector <8 x half> %929, <8 x half> poison, <4 x i32> + %1013 = shufflevector <8 x half> %929, <8 x half> poison, <4 x i32> + %1014 = insertelement <4 x float> poison, float %703, i64 0 + %1015 = insertelement <4 x float> %1014, float %704, i64 1 + %1016 = insertelement <4 x float> %1015, float %705, i64 2 + %1017 = insertelement <4 x float> %1016, float %706, i64 3 + %1018 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %866, <4 x float> %1017, i32 0, i32 0, i32 0) + %1019 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %867, <4 x float> %1018, i32 0, i32 0, i32 0) + %1020 = insertelement <4 x float> poison, float %707, i64 0 + %1021 = insertelement <4 x float> %1020, float %708, i64 1 + %1022 = insertelement <4 x float> %1021, float %709, i64 2 + %1023 = insertelement <4 x float> %1022, float %710, i64 3 + %1024 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %866, <4 x float> %1023, i32 0, i32 0, i32 0) + %1025 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %867, <4 x float> %1024, i32 0, i32 0, i32 0) + %1026 = insertelement <4 x float> poison, float %711, i64 0 + %1027 = insertelement <4 x float> %1026, float %712, i64 1 + %1028 = insertelement <4 x float> %1027, float %713, i64 2 + %1029 = insertelement <4 x float> %1028, float %714, i64 3 + %1030 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %868, <4 x float> %1029, i32 0, i32 0, i32 0) + %1031 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %869, <4 x float> %1030, i32 0, i32 0, i32 0) + %1032 = insertelement <4 x float> poison, float %715, i64 0 + %1033 = insertelement <4 x float> %1032, float %716, i64 1 + %1034 
= insertelement <4 x float> %1033, float %717, i64 2 + %1035 = insertelement <4 x float> %1034, float %718, i64 3 + %1036 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %868, <4 x float> %1035, i32 0, i32 0, i32 0) + %1037 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %869, <4 x float> %1036, i32 0, i32 0, i32 0) + %1038 = load <8 x half>, ptr addrspace(3) %372, align 16 + %1039 = load <8 x half>, ptr addrspace(3) %373, align 16 + %1040 = load <8 x half>, ptr addrspace(3) %378, align 16 + %1041 = load <8 x half>, ptr addrspace(3) %379, align 16 + %1042 = insertelement <4 x float> poison, float %655, i64 0 + %1043 = insertelement <4 x float> %1042, float %656, i64 1 + %1044 = insertelement <4 x float> %1043, float %657, i64 2 + %1045 = insertelement <4 x float> %1044, float %658, i64 3 + %1046 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %930, <4 x float> %1045, i32 0, i32 0, i32 0) + %1047 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %931, <4 x float> %1046, i32 0, i32 0, i32 0) + %1048 = insertelement <4 x float> poison, float %659, i64 0 + %1049 = insertelement <4 x float> %1048, float %660, i64 1 + %1050 = insertelement <4 x float> %1049, float %661, i64 2 + %1051 = insertelement <4 x float> %1050, float %662, i64 3 + %1052 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %930, <4 x float> %1051, i32 0, i32 0, i32 0) + %1053 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %931, <4 x float> %1052, i32 0, i32 0, i32 0) + %1054 = insertelement <4 x float> poison, float %663, i64 0 + %1055 = insertelement <4 x float> %1054, float %664, i64 1 + %1056 = insertelement <4 x float> %1055, float %665, i64 2 + %1057 = insertelement <4 x float> %1056, float %666, i64 3 + %1058 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x 
half> %982, <4 x half> %932, <4 x float> %1057, i32 0, i32 0, i32 0) + %1059 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %933, <4 x float> %1058, i32 0, i32 0, i32 0) + %1060 = insertelement <4 x float> poison, float %667, i64 0 + %1061 = insertelement <4 x float> %1060, float %668, i64 1 + %1062 = insertelement <4 x float> %1061, float %669, i64 2 + %1063 = insertelement <4 x float> %1062, float %670, i64 3 + %1064 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %932, <4 x float> %1063, i32 0, i32 0, i32 0) + %1065 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %933, <4 x float> %1064, i32 0, i32 0, i32 0) + %1066 = insertelement <4 x float> poison, float %639, i64 0 + %1067 = insertelement <4 x float> %1066, float %640, i64 1 + %1068 = insertelement <4 x float> %1067, float %641, i64 2 + %1069 = insertelement <4 x float> %1068, float %642, i64 3 + %1070 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %930, <4 x float> %1069, i32 0, i32 0, i32 0) + %1071 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %931, <4 x float> %1070, i32 0, i32 0, i32 0) + %1072 = insertelement <4 x float> poison, float %643, i64 0 + %1073 = insertelement <4 x float> %1072, float %644, i64 1 + %1074 = insertelement <4 x float> %1073, float %645, i64 2 + %1075 = insertelement <4 x float> %1074, float %646, i64 3 + %1076 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %930, <4 x float> %1075, i32 0, i32 0, i32 0) + %1077 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %931, <4 x float> %1076, i32 0, i32 0, i32 0) + %1078 = insertelement <4 x float> poison, float %647, i64 0 + %1079 = insertelement <4 x float> %1078, float %648, i64 1 + %1080 = insertelement <4 x float> %1079, float %649, i64 2 + %1081 = insertelement <4 
x float> %1080, float %650, i64 3 + %1082 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %932, <4 x float> %1081, i32 0, i32 0, i32 0) + %1083 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %933, <4 x float> %1082, i32 0, i32 0, i32 0) + %1084 = insertelement <4 x float> poison, float %651, i64 0 + %1085 = insertelement <4 x float> %1084, float %652, i64 1 + %1086 = insertelement <4 x float> %1085, float %653, i64 2 + %1087 = insertelement <4 x float> %1086, float %654, i64 3 + %1088 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %932, <4 x float> %1087, i32 0, i32 0, i32 0) + %1089 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %933, <4 x float> %1088, i32 0, i32 0, i32 0) + %1090 = shufflevector <8 x half> %1038, <8 x half> poison, <4 x i32> + %1091 = shufflevector <8 x half> %1038, <8 x half> poison, <4 x i32> + %1092 = shufflevector <8 x half> %1039, <8 x half> poison, <4 x i32> + %1093 = shufflevector <8 x half> %1039, <8 x half> poison, <4 x i32> + %1094 = insertelement <4 x float> poison, float %623, i64 0 + %1095 = insertelement <4 x float> %1094, float %624, i64 1 + %1096 = insertelement <4 x float> %1095, float %625, i64 2 + %1097 = insertelement <4 x float> %1096, float %626, i64 3 + %1098 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1090, <4 x float> %1097, i32 0, i32 0, i32 0) + %1099 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1091, <4 x float> %1098, i32 0, i32 0, i32 0) + %1100 = insertelement <4 x float> poison, float %627, i64 0 + %1101 = insertelement <4 x float> %1100, float %628, i64 1 + %1102 = insertelement <4 x float> %1101, float %629, i64 2 + %1103 = insertelement <4 x float> %1102, float %630, i64 3 + %1104 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1090, 
<4 x float> %1103, i32 0, i32 0, i32 0) + %1105 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1091, <4 x float> %1104, i32 0, i32 0, i32 0) + %1106 = insertelement <4 x float> poison, float %631, i64 0 + %1107 = insertelement <4 x float> %1106, float %632, i64 1 + %1108 = insertelement <4 x float> %1107, float %633, i64 2 + %1109 = insertelement <4 x float> %1108, float %634, i64 3 + %1110 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1092, <4 x float> %1109, i32 0, i32 0, i32 0) + %1111 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1093, <4 x float> %1110, i32 0, i32 0, i32 0) + %1112 = insertelement <4 x float> poison, float %635, i64 0 + %1113 = insertelement <4 x float> %1112, float %636, i64 1 + %1114 = insertelement <4 x float> %1113, float %637, i64 2 + %1115 = insertelement <4 x float> %1114, float %638, i64 3 + %1116 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1092, <4 x float> %1115, i32 0, i32 0, i32 0) + %1117 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1093, <4 x float> %1116, i32 0, i32 0, i32 0) + %1118 = insertelement <4 x float> poison, float %607, i64 0 + %1119 = insertelement <4 x float> %1118, float %608, i64 1 + %1120 = insertelement <4 x float> %1119, float %609, i64 2 + %1121 = insertelement <4 x float> %1120, float %610, i64 3 + %1122 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1090, <4 x float> %1121, i32 0, i32 0, i32 0) + %1123 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1091, <4 x float> %1122, i32 0, i32 0, i32 0) + %1124 = insertelement <4 x float> poison, float %611, i64 0 + %1125 = insertelement <4 x float> %1124, float %612, i64 1 + %1126 = insertelement <4 x float> %1125, float %613, i64 2 + %1127 = insertelement <4 x float> %1126, float 
%614, i64 3 + %1128 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1090, <4 x float> %1127, i32 0, i32 0, i32 0) + %1129 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1091, <4 x float> %1128, i32 0, i32 0, i32 0) + %1130 = insertelement <4 x float> poison, float %615, i64 0 + %1131 = insertelement <4 x float> %1130, float %616, i64 1 + %1132 = insertelement <4 x float> %1131, float %617, i64 2 + %1133 = insertelement <4 x float> %1132, float %618, i64 3 + %1134 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1092, <4 x float> %1133, i32 0, i32 0, i32 0) + %1135 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1093, <4 x float> %1134, i32 0, i32 0, i32 0) + %1136 = insertelement <4 x float> poison, float %619, i64 0 + %1137 = insertelement <4 x float> %1136, float %620, i64 1 + %1138 = insertelement <4 x float> %1137, float %621, i64 2 + %1139 = insertelement <4 x float> %1138, float %622, i64 3 + %1140 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1092, <4 x float> %1139, i32 0, i32 0, i32 0) + %1141 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1093, <4 x float> %1140, i32 0, i32 0, i32 0) + %1142 = shufflevector <8 x half> %1040, <8 x half> poison, <4 x i32> + %1143 = shufflevector <8 x half> %1040, <8 x half> poison, <4 x i32> + %1144 = shufflevector <8 x half> %1041, <8 x half> poison, <4 x i32> + %1145 = shufflevector <8 x half> %1041, <8 x half> poison, <4 x i32> + %1146 = insertelement <4 x float> poison, float %559, i64 0 + %1147 = insertelement <4 x float> %1146, float %560, i64 1 + %1148 = insertelement <4 x float> %1147, float %561, i64 2 + %1149 = insertelement <4 x float> %1148, float %562, i64 3 + %1150 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1142, <4 x float> %1149, 
i32 0, i32 0, i32 0) + %1151 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1143, <4 x float> %1150, i32 0, i32 0, i32 0) + %1152 = insertelement <4 x float> poison, float %563, i64 0 + %1153 = insertelement <4 x float> %1152, float %564, i64 1 + %1154 = insertelement <4 x float> %1153, float %565, i64 2 + %1155 = insertelement <4 x float> %1154, float %566, i64 3 + %1156 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1142, <4 x float> %1155, i32 0, i32 0, i32 0) + %1157 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1143, <4 x float> %1156, i32 0, i32 0, i32 0) + %1158 = insertelement <4 x float> poison, float %567, i64 0 + %1159 = insertelement <4 x float> %1158, float %568, i64 1 + %1160 = insertelement <4 x float> %1159, float %569, i64 2 + %1161 = insertelement <4 x float> %1160, float %570, i64 3 + %1162 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1144, <4 x float> %1161, i32 0, i32 0, i32 0) + %1163 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1145, <4 x float> %1162, i32 0, i32 0, i32 0) + %1164 = insertelement <4 x float> poison, float %571, i64 0 + %1165 = insertelement <4 x float> %1164, float %572, i64 1 + %1166 = insertelement <4 x float> %1165, float %573, i64 2 + %1167 = insertelement <4 x float> %1166, float %574, i64 3 + %1168 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1144, <4 x float> %1167, i32 0, i32 0, i32 0) + %1169 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1145, <4 x float> %1168, i32 0, i32 0, i32 0) + %1170 = insertelement <4 x float> poison, float %543, i64 0 + %1171 = insertelement <4 x float> %1170, float %544, i64 1 + %1172 = insertelement <4 x float> %1171, float %545, i64 2 + %1173 = insertelement <4 x float> %1172, float %546, i64 3 + %1174 = 
tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1142, <4 x float> %1173, i32 0, i32 0, i32 0) + %1175 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1143, <4 x float> %1174, i32 0, i32 0, i32 0) + %1176 = insertelement <4 x float> poison, float %547, i64 0 + %1177 = insertelement <4 x float> %1176, float %548, i64 1 + %1178 = insertelement <4 x float> %1177, float %549, i64 2 + %1179 = insertelement <4 x float> %1178, float %550, i64 3 + %1180 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1142, <4 x float> %1179, i32 0, i32 0, i32 0) + %1181 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1143, <4 x float> %1180, i32 0, i32 0, i32 0) + %1182 = insertelement <4 x float> poison, float %551, i64 0 + %1183 = insertelement <4 x float> %1182, float %552, i64 1 + %1184 = insertelement <4 x float> %1183, float %553, i64 2 + %1185 = insertelement <4 x float> %1184, float %554, i64 3 + %1186 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1144, <4 x float> %1185, i32 0, i32 0, i32 0) + %1187 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1145, <4 x float> %1186, i32 0, i32 0, i32 0) + %1188 = insertelement <4 x float> poison, float %555, i64 0 + %1189 = insertelement <4 x float> %1188, float %556, i64 1 + %1190 = insertelement <4 x float> %1189, float %557, i64 2 + %1191 = insertelement <4 x float> %1190, float %558, i64 3 + %1192 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1144, <4 x float> %1191, i32 0, i32 0, i32 0) + %1193 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1145, <4 x float> %1192, i32 0, i32 0, i32 0) + %1194 = insertelement <4 x float> poison, float %591, i64 0 + %1195 = insertelement <4 x float> %1194, float %592, i64 1 + %1196 = insertelement 
<4 x float> %1195, float %593, i64 2 + %1197 = insertelement <4 x float> %1196, float %594, i64 3 + %1198 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1090, <4 x float> %1197, i32 0, i32 0, i32 0) + %1199 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1091, <4 x float> %1198, i32 0, i32 0, i32 0) + %1200 = insertelement <4 x float> poison, float %595, i64 0 + %1201 = insertelement <4 x float> %1200, float %596, i64 1 + %1202 = insertelement <4 x float> %1201, float %597, i64 2 + %1203 = insertelement <4 x float> %1202, float %598, i64 3 + %1204 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1090, <4 x float> %1203, i32 0, i32 0, i32 0) + %1205 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1091, <4 x float> %1204, i32 0, i32 0, i32 0) + %1206 = insertelement <4 x float> poison, float %599, i64 0 + %1207 = insertelement <4 x float> %1206, float %600, i64 1 + %1208 = insertelement <4 x float> %1207, float %601, i64 2 + %1209 = insertelement <4 x float> %1208, float %602, i64 3 + %1210 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1092, <4 x float> %1209, i32 0, i32 0, i32 0) + %1211 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1093, <4 x float> %1210, i32 0, i32 0, i32 0) + %1212 = insertelement <4 x float> poison, float %603, i64 0 + %1213 = insertelement <4 x float> %1212, float %604, i64 1 + %1214 = insertelement <4 x float> %1213, float %605, i64 2 + %1215 = insertelement <4 x float> %1214, float %606, i64 3 + %1216 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1092, <4 x float> %1215, i32 0, i32 0, i32 0) + %1217 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1093, <4 x float> %1216, i32 0, i32 0, i32 0) + %1218 = load <8 x half>, ptr 
addrspace(3) %385, align 16 + %1219 = load <8 x half>, ptr addrspace(3) %386, align 16 + %1220 = load <8 x half>, ptr addrspace(3) %388, align 16 + %1221 = load <8 x half>, ptr addrspace(3) %389, align 16 + %1222 = insertelement <4 x float> poison, float %575, i64 0 + %1223 = insertelement <4 x float> %1222, float %576, i64 1 + %1224 = insertelement <4 x float> %1223, float %577, i64 2 + %1225 = insertelement <4 x float> %1224, float %578, i64 3 + %1226 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1090, <4 x float> %1225, i32 0, i32 0, i32 0) + %1227 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1091, <4 x float> %1226, i32 0, i32 0, i32 0) + %1228 = insertelement <4 x float> poison, float %579, i64 0 + %1229 = insertelement <4 x float> %1228, float %580, i64 1 + %1230 = insertelement <4 x float> %1229, float %581, i64 2 + %1231 = insertelement <4 x float> %1230, float %582, i64 3 + %1232 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1090, <4 x float> %1231, i32 0, i32 0, i32 0) + %1233 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1091, <4 x float> %1232, i32 0, i32 0, i32 0) + %1234 = insertelement <4 x float> poison, float %583, i64 0 + %1235 = insertelement <4 x float> %1234, float %584, i64 1 + %1236 = insertelement <4 x float> %1235, float %585, i64 2 + %1237 = insertelement <4 x float> %1236, float %586, i64 3 + %1238 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1092, <4 x float> %1237, i32 0, i32 0, i32 0) + %1239 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1093, <4 x float> %1238, i32 0, i32 0, i32 0) + %1240 = insertelement <4 x float> poison, float %587, i64 0 + %1241 = insertelement <4 x float> %1240, float %588, i64 1 + %1242 = insertelement <4 x float> %1241, float %589, i64 2 + %1243 = insertelement 
<4 x float> %1242, float %590, i64 3 + %1244 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1092, <4 x float> %1243, i32 0, i32 0, i32 0) + %1245 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1093, <4 x float> %1244, i32 0, i32 0, i32 0) + %1246 = load <8 x half>, ptr addrspace(3) %391, align 16 + %1247 = load <8 x half>, ptr addrspace(3) %392, align 16 + %1248 = load <8 x half>, ptr addrspace(3) %395, align 16 + %1249 = load <8 x half>, ptr addrspace(3) %396, align 16 + %1250 = insertelement <4 x float> poison, float %527, i64 0 + %1251 = insertelement <4 x float> %1250, float %528, i64 1 + %1252 = insertelement <4 x float> %1251, float %529, i64 2 + %1253 = insertelement <4 x float> %1252, float %530, i64 3 + %1254 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1142, <4 x float> %1253, i32 0, i32 0, i32 0) + %1255 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1143, <4 x float> %1254, i32 0, i32 0, i32 0) + %1256 = insertelement <4 x float> poison, float %531, i64 0 + %1257 = insertelement <4 x float> %1256, float %532, i64 1 + %1258 = insertelement <4 x float> %1257, float %533, i64 2 + %1259 = insertelement <4 x float> %1258, float %534, i64 3 + %1260 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1142, <4 x float> %1259, i32 0, i32 0, i32 0) + %1261 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1143, <4 x float> %1260, i32 0, i32 0, i32 0) + %1262 = insertelement <4 x float> poison, float %535, i64 0 + %1263 = insertelement <4 x float> %1262, float %536, i64 1 + %1264 = insertelement <4 x float> %1263, float %537, i64 2 + %1265 = insertelement <4 x float> %1264, float %538, i64 3 + %1266 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1144, <4 x float> %1265, i32 0, i32 0, i32 0) + 
%1267 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1145, <4 x float> %1266, i32 0, i32 0, i32 0) + %1268 = insertelement <4 x float> poison, float %539, i64 0 + %1269 = insertelement <4 x float> %1268, float %540, i64 1 + %1270 = insertelement <4 x float> %1269, float %541, i64 2 + %1271 = insertelement <4 x float> %1270, float %542, i64 3 + %1272 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1144, <4 x float> %1271, i32 0, i32 0, i32 0) + %1273 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1145, <4 x float> %1272, i32 0, i32 0, i32 0) + %1274 = insertelement <4 x float> poison, float %511, i64 0 + %1275 = insertelement <4 x float> %1274, float %512, i64 1 + %1276 = insertelement <4 x float> %1275, float %513, i64 2 + %1277 = insertelement <4 x float> %1276, float %514, i64 3 + %1278 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1142, <4 x float> %1277, i32 0, i32 0, i32 0) + %1279 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1143, <4 x float> %1278, i32 0, i32 0, i32 0) + %1280 = insertelement <4 x float> poison, float %515, i64 0 + %1281 = insertelement <4 x float> %1280, float %516, i64 1 + %1282 = insertelement <4 x float> %1281, float %517, i64 2 + %1283 = insertelement <4 x float> %1282, float %518, i64 3 + %1284 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1142, <4 x float> %1283, i32 0, i32 0, i32 0) + %1285 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1143, <4 x float> %1284, i32 0, i32 0, i32 0) + %1286 = insertelement <4 x float> poison, float %519, i64 0 + %1287 = insertelement <4 x float> %1286, float %520, i64 1 + %1288 = insertelement <4 x float> %1287, float %521, i64 2 + %1289 = insertelement <4 x float> %1288, float %522, i64 3 + %1290 = tail call <4 x 
float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1144, <4 x float> %1289, i32 0, i32 0, i32 0) + %1291 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1145, <4 x float> %1290, i32 0, i32 0, i32 0) + %1292 = insertelement <4 x float> poison, float %523, i64 0 + %1293 = insertelement <4 x float> %1292, float %524, i64 1 + %1294 = insertelement <4 x float> %1293, float %525, i64 2 + %1295 = insertelement <4 x float> %1294, float %526, i64 3 + %1296 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1144, <4 x float> %1295, i32 0, i32 0, i32 0) + %1297 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1145, <4 x float> %1296, i32 0, i32 0, i32 0) + %1298 = shufflevector <8 x half> %1218, <8 x half> poison, <4 x i32> + %1299 = shufflevector <8 x half> %1218, <8 x half> poison, <4 x i32> + %1300 = shufflevector <8 x half> %1219, <8 x half> poison, <4 x i32> + %1301 = shufflevector <8 x half> %1219, <8 x half> poison, <4 x i32> + %1302 = shufflevector <8 x half> %1220, <8 x half> poison, <4 x i32> + %1303 = shufflevector <8 x half> %1220, <8 x half> poison, <4 x i32> + %1304 = shufflevector <8 x half> %1221, <8 x half> poison, <4 x i32> + %1305 = shufflevector <8 x half> %1221, <8 x half> poison, <4 x i32> + %1306 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1298, <4 x float> %879, i32 0, i32 0, i32 0) + %1307 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1299, <4 x float> %1306, i32 0, i32 0, i32 0) + %1308 = extractelement <4 x float> %1307, i64 0 + %1309 = extractelement <4 x float> %1307, i64 1 + %1310 = extractelement <4 x float> %1307, i64 2 + %1311 = extractelement <4 x float> %1307, i64 3 + %1312 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1298, <4 x float> %885, i32 0, i32 0, i32 0) + %1313 = tail call 
<4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1299, <4 x float> %1312, i32 0, i32 0, i32 0) + %1314 = extractelement <4 x float> %1313, i64 0 + %1315 = extractelement <4 x float> %1313, i64 1 + %1316 = extractelement <4 x float> %1313, i64 2 + %1317 = extractelement <4 x float> %1313, i64 3 + %1318 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1300, <4 x float> %891, i32 0, i32 0, i32 0) + %1319 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1301, <4 x float> %1318, i32 0, i32 0, i32 0) + %1320 = extractelement <4 x float> %1319, i64 0 + %1321 = extractelement <4 x float> %1319, i64 1 + %1322 = extractelement <4 x float> %1319, i64 2 + %1323 = extractelement <4 x float> %1319, i64 3 + %1324 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1300, <4 x float> %897, i32 0, i32 0, i32 0) + %1325 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1301, <4 x float> %1324, i32 0, i32 0, i32 0) + %1326 = extractelement <4 x float> %1325, i64 0 + %1327 = extractelement <4 x float> %1325, i64 1 + %1328 = extractelement <4 x float> %1325, i64 2 + %1329 = extractelement <4 x float> %1325, i64 3 + %1330 = load <8 x half>, ptr addrspace(3) %399, align 16 + %1331 = load <8 x half>, ptr addrspace(3) %400, align 16 + %1332 = load <8 x half>, ptr addrspace(3) %403, align 16 + %1333 = load <8 x half>, ptr addrspace(3) %404, align 16 + %1334 = shufflevector <8 x half> %1246, <8 x half> poison, <4 x i32> + %1335 = shufflevector <8 x half> %1246, <8 x half> poison, <4 x i32> + %1336 = shufflevector <8 x half> %1247, <8 x half> poison, <4 x i32> + %1337 = shufflevector <8 x half> %1247, <8 x half> poison, <4 x i32> + %1338 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1298, <4 x float> %907, i32 0, i32 0, i32 0) + %1339 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1299, <4 x float> %1338, i32 0, i32 0, i32 0) + %1340 = extractelement <4 x float> %1339, i64 0 + %1341 = extractelement <4 x float> %1339, i64 1 + %1342 = extractelement <4 x float> %1339, i64 2 + %1343 = extractelement <4 x float> %1339, i64 3 + %1344 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1298, <4 x float> %913, i32 0, i32 0, i32 0) + %1345 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1299, <4 x float> %1344, i32 0, i32 0, i32 0) + %1346 = extractelement <4 x float> %1345, i64 0 + %1347 = extractelement <4 x float> %1345, i64 1 + %1348 = extractelement <4 x float> %1345, i64 2 + %1349 = extractelement <4 x float> %1345, i64 3 + %1350 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1300, <4 x float> %919, i32 0, i32 0, i32 0) + %1351 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1301, <4 x float> %1350, i32 0, i32 0, i32 0) + %1352 = extractelement <4 x float> %1351, i64 0 + %1353 = extractelement <4 x float> %1351, i64 1 + %1354 = extractelement <4 x float> %1351, i64 2 + %1355 = extractelement <4 x float> %1351, i64 3 + %1356 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1300, <4 x float> %925, i32 0, i32 0, i32 0) + %1357 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1301, <4 x float> %1356, i32 0, i32 0, i32 0) + %1358 = extractelement <4 x float> %1357, i64 0 + %1359 = extractelement <4 x float> %1357, i64 1 + %1360 = extractelement <4 x float> %1357, i64 2 + %1361 = extractelement <4 x float> %1357, i64 3 + %1362 = load <8 x half>, ptr addrspace(3) %407, align 16 + %1363 = load <8 x half>, ptr addrspace(3) %408, align 16 + %1364 = load <8 x half>, ptr addrspace(3) %411, align 16 + %1365 = load <8 x half>, ptr addrspace(3) %412, align 16 + 
%1366 = shufflevector <8 x half> %1248, <8 x half> poison, <4 x i32> + %1367 = shufflevector <8 x half> %1248, <8 x half> poison, <4 x i32> + %1368 = shufflevector <8 x half> %1249, <8 x half> poison, <4 x i32> + %1369 = shufflevector <8 x half> %1249, <8 x half> poison, <4 x i32> + %1370 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1366, <4 x float> %939, i32 0, i32 0, i32 0) + %1371 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1367, <4 x float> %1370, i32 0, i32 0, i32 0) + %1372 = extractelement <4 x float> %1371, i64 0 + %1373 = extractelement <4 x float> %1371, i64 1 + %1374 = extractelement <4 x float> %1371, i64 2 + %1375 = extractelement <4 x float> %1371, i64 3 + %1376 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1366, <4 x float> %945, i32 0, i32 0, i32 0) + %1377 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1367, <4 x float> %1376, i32 0, i32 0, i32 0) + %1378 = extractelement <4 x float> %1377, i64 0 + %1379 = extractelement <4 x float> %1377, i64 1 + %1380 = extractelement <4 x float> %1377, i64 2 + %1381 = extractelement <4 x float> %1377, i64 3 + %1382 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1368, <4 x float> %951, i32 0, i32 0, i32 0) + %1383 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1369, <4 x float> %1382, i32 0, i32 0, i32 0) + %1384 = extractelement <4 x float> %1383, i64 0 + %1385 = extractelement <4 x float> %1383, i64 1 + %1386 = extractelement <4 x float> %1383, i64 2 + %1387 = extractelement <4 x float> %1383, i64 3 + %1388 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1368, <4 x float> %957, i32 0, i32 0, i32 0) + %1389 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1369, <4 x float> %1388, i32 0, 
i32 0, i32 0) + %1390 = extractelement <4 x float> %1389, i64 0 + %1391 = extractelement <4 x float> %1389, i64 1 + %1392 = extractelement <4 x float> %1389, i64 2 + %1393 = extractelement <4 x float> %1389, i64 3 + %1394 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1366, <4 x float> %963, i32 0, i32 0, i32 0) + %1395 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1367, <4 x float> %1394, i32 0, i32 0, i32 0) + %1396 = extractelement <4 x float> %1395, i64 0 + %1397 = extractelement <4 x float> %1395, i64 1 + %1398 = extractelement <4 x float> %1395, i64 2 + %1399 = extractelement <4 x float> %1395, i64 3 + %1400 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1366, <4 x float> %969, i32 0, i32 0, i32 0) + %1401 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1367, <4 x float> %1400, i32 0, i32 0, i32 0) + %1402 = extractelement <4 x float> %1401, i64 0 + %1403 = extractelement <4 x float> %1401, i64 1 + %1404 = extractelement <4 x float> %1401, i64 2 + %1405 = extractelement <4 x float> %1401, i64 3 + %1406 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1368, <4 x float> %975, i32 0, i32 0, i32 0) + %1407 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1369, <4 x float> %1406, i32 0, i32 0, i32 0) + %1408 = extractelement <4 x float> %1407, i64 0 + %1409 = extractelement <4 x float> %1407, i64 1 + %1410 = extractelement <4 x float> %1407, i64 2 + %1411 = extractelement <4 x float> %1407, i64 3 + %1412 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1368, <4 x float> %981, i32 0, i32 0, i32 0) + %1413 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1369, <4 x float> %1412, i32 0, i32 0, i32 0) + %1414 = extractelement <4 x float> %1413, i64 0 + 
%1415 = extractelement <4 x float> %1413, i64 1 + %1416 = extractelement <4 x float> %1413, i64 2 + %1417 = extractelement <4 x float> %1413, i64 3 + fence syncscope("workgroup") release + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + %1418 = shufflevector <2 x half> %865, <2 x half> %864, <8 x i32> + %1419 = shufflevector <2 x half> %863, <2 x half> poison, <8 x i32> + %1420 = shufflevector <8 x half> %1418, <8 x half> %1419, <8 x i32> + %1421 = shufflevector <2 x half> %862, <2 x half> poison, <8 x i32> + %1422 = shufflevector <8 x half> %1420, <8 x half> %1421, <8 x i32> + store <8 x half> %1422, ptr addrspace(3) %199, align 16 + %1423 = shufflevector <2 x half> %861, <2 x half> %860, <8 x i32> + %1424 = shufflevector <2 x half> %859, <2 x half> poison, <8 x i32> + %1425 = shufflevector <8 x half> %1423, <8 x half> %1424, <8 x i32> + %1426 = shufflevector <2 x half> %858, <2 x half> poison, <8 x i32> + %1427 = shufflevector <8 x half> %1425, <8 x half> %1426, <8 x i32> + store <8 x half> %1427, ptr addrspace(3) %201, align 16 + %1428 = shufflevector <2 x half> %857, <2 x half> %856, <8 x i32> + %1429 = shufflevector <2 x half> %855, <2 x half> poison, <8 x i32> + %1430 = shufflevector <8 x half> %1428, <8 x half> %1429, <8 x i32> + %1431 = shufflevector <2 x half> %854, <2 x half> poison, <8 x i32> + %1432 = shufflevector <8 x half> %1430, <8 x half> %1431, <8 x i32> + store <8 x half> %1432, ptr addrspace(3) %203, align 16 + %1433 = shufflevector <2 x half> %853, <2 x half> %852, <8 x i32> + %1434 = shufflevector <2 x half> %851, <2 x half> poison, <8 x i32> + %1435 = shufflevector <8 x half> %1433, <8 x half> %1434, <8 x i32> + %1436 = shufflevector <2 x half> %850, <2 x half> poison, <8 x i32> + %1437 = shufflevector <8 x half> %1435, <8 x half> %1436, <8 x i32> + store <8 x half> %1437, ptr addrspace(3) %205, align 16 + %1438 = shufflevector <2 x half> %849, <2 x half> %848, <8 x i32> + %1439 = shufflevector <2 x half> 
%847, <2 x half> poison, <8 x i32> + %1440 = shufflevector <8 x half> %1438, <8 x half> %1439, <8 x i32> + %1441 = shufflevector <2 x half> %846, <2 x half> poison, <8 x i32> + %1442 = shufflevector <8 x half> %1440, <8 x half> %1441, <8 x i32> + store <8 x half> %1442, ptr addrspace(3) %207, align 16 + %1443 = shufflevector <2 x half> %845, <2 x half> %844, <8 x i32> + %1444 = shufflevector <2 x half> %843, <2 x half> poison, <8 x i32> + %1445 = shufflevector <8 x half> %1443, <8 x half> %1444, <8 x i32> + %1446 = shufflevector <2 x half> %842, <2 x half> poison, <8 x i32> + %1447 = shufflevector <8 x half> %1445, <8 x half> %1446, <8 x i32> + store <8 x half> %1447, ptr addrspace(3) %209, align 16 + %1448 = shufflevector <2 x half> %841, <2 x half> %840, <8 x i32> + %1449 = shufflevector <2 x half> %839, <2 x half> poison, <8 x i32> + %1450 = shufflevector <8 x half> %1448, <8 x half> %1449, <8 x i32> + %1451 = shufflevector <2 x half> %838, <2 x half> poison, <8 x i32> + %1452 = shufflevector <8 x half> %1450, <8 x half> %1451, <8 x i32> + store <8 x half> %1452, ptr addrspace(3) %211, align 16 + %1453 = shufflevector <2 x half> %837, <2 x half> %836, <8 x i32> + %1454 = shufflevector <2 x half> %835, <2 x half> poison, <8 x i32> + %1455 = shufflevector <8 x half> %1453, <8 x half> %1454, <8 x i32> + %1456 = shufflevector <2 x half> %834, <2 x half> poison, <8 x i32> + %1457 = shufflevector <8 x half> %1455, <8 x half> %1456, <8 x i32> + store <8 x half> %1457, ptr addrspace(3) %213, align 16 + %1458 = getelementptr i8, ptr addrspace(1) %752, i64 128 + %1459 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %1458, i16 0, i32 2147483646, i32 159744) + %1460 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %68, i32 0, i32 0) + %1461 = bitcast <4 x i32> %1460 to <8 x half> + %1462 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %71, i32 0, i32 0) + %1463 = 
bitcast <4 x i32> %1462 to <8 x half> + %1464 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %74, i32 0, i32 0) + %1465 = bitcast <4 x i32> %1464 to <8 x half> + %1466 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %77, i32 0, i32 0) + %1467 = bitcast <4 x i32> %1466 to <8 x half> + %1468 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %80, i32 0, i32 0) + %1469 = bitcast <4 x i32> %1468 to <8 x half> + %1470 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %83, i32 0, i32 0) + %1471 = bitcast <4 x i32> %1470 to <8 x half> + %1472 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %86, i32 0, i32 0) + %1473 = bitcast <4 x i32> %1472 to <8 x half> + %1474 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %89, i32 0, i32 0) + %1475 = bitcast <4 x i32> %1474 to <8 x half> + %1476 = shufflevector <8 x half> %1330, <8 x half> poison, <4 x i32> + %1477 = shufflevector <8 x half> %1330, <8 x half> poison, <4 x i32> + %1478 = shufflevector <8 x half> %1331, <8 x half> poison, <4 x i32> + %1479 = shufflevector <8 x half> %1331, <8 x half> poison, <4 x i32> + %1480 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1298, <4 x float> %991, i32 0, i32 0, i32 0) + %1481 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1299, <4 x float> %1480, i32 0, i32 0, i32 0) + %1482 = extractelement <4 x float> %1481, i64 0 + %1483 = extractelement <4 x float> %1481, i64 1 + %1484 = extractelement <4 x float> %1481, i64 2 + %1485 = extractelement <4 x float> %1481, i64 3 + %1486 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1298, <4 x float> %997, i32 0, i32 0, i32 0) + %1487 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 
x half> %1479, <4 x half> %1299, <4 x float> %1486, i32 0, i32 0, i32 0) + %1488 = extractelement <4 x float> %1487, i64 0 + %1489 = extractelement <4 x float> %1487, i64 1 + %1490 = extractelement <4 x float> %1487, i64 2 + %1491 = extractelement <4 x float> %1487, i64 3 + %1492 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1300, <4 x float> %1003, i32 0, i32 0, i32 0) + %1493 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1301, <4 x float> %1492, i32 0, i32 0, i32 0) + %1494 = extractelement <4 x float> %1493, i64 0 + %1495 = extractelement <4 x float> %1493, i64 1 + %1496 = extractelement <4 x float> %1493, i64 2 + %1497 = extractelement <4 x float> %1493, i64 3 + %1498 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1300, <4 x float> %1009, i32 0, i32 0, i32 0) + %1499 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1301, <4 x float> %1498, i32 0, i32 0, i32 0) + %1500 = extractelement <4 x float> %1499, i64 0 + %1501 = extractelement <4 x float> %1499, i64 1 + %1502 = extractelement <4 x float> %1499, i64 2 + %1503 = extractelement <4 x float> %1499, i64 3 + %1504 = shufflevector <8 x half> %1332, <8 x half> poison, <4 x i32> + %1505 = shufflevector <8 x half> %1332, <8 x half> poison, <4 x i32> + %1506 = shufflevector <8 x half> %1333, <8 x half> poison, <4 x i32> + %1507 = shufflevector <8 x half> %1333, <8 x half> poison, <4 x i32> + %1508 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1298, <4 x float> %1019, i32 0, i32 0, i32 0) + %1509 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1299, <4 x float> %1508, i32 0, i32 0, i32 0) + %1510 = extractelement <4 x float> %1509, i64 0 + %1511 = extractelement <4 x float> %1509, i64 1 + %1512 = extractelement <4 x float> %1509, i64 2 + %1513 = extractelement <4 x float> 
%1509, i64 3 + %1514 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1298, <4 x float> %1025, i32 0, i32 0, i32 0) + %1515 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1299, <4 x float> %1514, i32 0, i32 0, i32 0) + %1516 = extractelement <4 x float> %1515, i64 0 + %1517 = extractelement <4 x float> %1515, i64 1 + %1518 = extractelement <4 x float> %1515, i64 2 + %1519 = extractelement <4 x float> %1515, i64 3 + %1520 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1300, <4 x float> %1031, i32 0, i32 0, i32 0) + %1521 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1301, <4 x float> %1520, i32 0, i32 0, i32 0) + %1522 = extractelement <4 x float> %1521, i64 0 + %1523 = extractelement <4 x float> %1521, i64 1 + %1524 = extractelement <4 x float> %1521, i64 2 + %1525 = extractelement <4 x float> %1521, i64 3 + %1526 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1300, <4 x float> %1037, i32 0, i32 0, i32 0) + %1527 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1301, <4 x float> %1526, i32 0, i32 0, i32 0) + %1528 = extractelement <4 x float> %1527, i64 0 + %1529 = extractelement <4 x float> %1527, i64 1 + %1530 = extractelement <4 x float> %1527, i64 2 + %1531 = extractelement <4 x float> %1527, i64 3 + %1532 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1366, <4 x float> %1047, i32 0, i32 0, i32 0) + %1533 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1367, <4 x float> %1532, i32 0, i32 0, i32 0) + %1534 = extractelement <4 x float> %1533, i64 0 + %1535 = extractelement <4 x float> %1533, i64 1 + %1536 = extractelement <4 x float> %1533, i64 2 + %1537 = extractelement <4 x float> %1533, i64 3 + %1538 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1366, <4 x float> %1053, i32 0, i32 0, i32 0) + %1539 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1367, <4 x float> %1538, i32 0, i32 0, i32 0) + %1540 = extractelement <4 x float> %1539, i64 0 + %1541 = extractelement <4 x float> %1539, i64 1 + %1542 = extractelement <4 x float> %1539, i64 2 + %1543 = extractelement <4 x float> %1539, i64 3 + %1544 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1368, <4 x float> %1059, i32 0, i32 0, i32 0) + %1545 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1369, <4 x float> %1544, i32 0, i32 0, i32 0) + %1546 = extractelement <4 x float> %1545, i64 0 + %1547 = extractelement <4 x float> %1545, i64 1 + %1548 = extractelement <4 x float> %1545, i64 2 + %1549 = extractelement <4 x float> %1545, i64 3 + %1550 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1368, <4 x float> %1065, i32 0, i32 0, i32 0) + %1551 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1369, <4 x float> %1550, i32 0, i32 0, i32 0) + %1552 = extractelement <4 x float> %1551, i64 0 + %1553 = extractelement <4 x float> %1551, i64 1 + %1554 = extractelement <4 x float> %1551, i64 2 + %1555 = extractelement <4 x float> %1551, i64 3 + %1556 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1366, <4 x float> %1071, i32 0, i32 0, i32 0) + %1557 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1367, <4 x float> %1556, i32 0, i32 0, i32 0) + %1558 = extractelement <4 x float> %1557, i64 0 + %1559 = extractelement <4 x float> %1557, i64 1 + %1560 = extractelement <4 x float> %1557, i64 2 + %1561 = extractelement <4 x float> %1557, i64 3 + %1562 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> 
%1366, <4 x float> %1077, i32 0, i32 0, i32 0) + %1563 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1367, <4 x float> %1562, i32 0, i32 0, i32 0) + %1564 = extractelement <4 x float> %1563, i64 0 + %1565 = extractelement <4 x float> %1563, i64 1 + %1566 = extractelement <4 x float> %1563, i64 2 + %1567 = extractelement <4 x float> %1563, i64 3 + %1568 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1368, <4 x float> %1083, i32 0, i32 0, i32 0) + %1569 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1369, <4 x float> %1568, i32 0, i32 0, i32 0) + %1570 = extractelement <4 x float> %1569, i64 0 + %1571 = extractelement <4 x float> %1569, i64 1 + %1572 = extractelement <4 x float> %1569, i64 2 + %1573 = extractelement <4 x float> %1569, i64 3 + %1574 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1368, <4 x float> %1089, i32 0, i32 0, i32 0) + %1575 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1369, <4 x float> %1574, i32 0, i32 0, i32 0) + %1576 = extractelement <4 x float> %1575, i64 0 + %1577 = extractelement <4 x float> %1575, i64 1 + %1578 = extractelement <4 x float> %1575, i64 2 + %1579 = extractelement <4 x float> %1575, i64 3 + %1580 = shufflevector <2 x half> %833, <2 x half> %832, <8 x i32> + %1581 = shufflevector <2 x half> %831, <2 x half> poison, <8 x i32> + %1582 = shufflevector <8 x half> %1580, <8 x half> %1581, <8 x i32> + %1583 = shufflevector <2 x half> %830, <2 x half> poison, <8 x i32> + %1584 = shufflevector <8 x half> %1582, <8 x half> %1583, <8 x i32> + store <8 x half> %1584, ptr addrspace(3) %214, align 16 + %1585 = shufflevector <2 x half> %829, <2 x half> %828, <8 x i32> + %1586 = shufflevector <2 x half> %827, <2 x half> poison, <8 x i32> + %1587 = shufflevector <8 x half> %1585, <8 x half> %1586, <8 x i32> + %1588 = shufflevector <2 x 
half> %826, <2 x half> poison, <8 x i32> + %1589 = shufflevector <8 x half> %1587, <8 x half> %1588, <8 x i32> + store <8 x half> %1589, ptr addrspace(3) %215, align 16 + %1590 = shufflevector <2 x half> %825, <2 x half> %824, <8 x i32> + %1591 = shufflevector <2 x half> %823, <2 x half> poison, <8 x i32> + %1592 = shufflevector <8 x half> %1590, <8 x half> %1591, <8 x i32> + %1593 = shufflevector <2 x half> %822, <2 x half> poison, <8 x i32> + %1594 = shufflevector <8 x half> %1592, <8 x half> %1593, <8 x i32> + store <8 x half> %1594, ptr addrspace(3) %216, align 16 + %1595 = shufflevector <2 x half> %821, <2 x half> %820, <8 x i32> + %1596 = shufflevector <2 x half> %819, <2 x half> poison, <8 x i32> + %1597 = shufflevector <8 x half> %1595, <8 x half> %1596, <8 x i32> + %1598 = shufflevector <2 x half> %818, <2 x half> poison, <8 x i32> + %1599 = shufflevector <8 x half> %1597, <8 x half> %1598, <8 x i32> + store <8 x half> %1599, ptr addrspace(3) %217, align 16 + %1600 = shufflevector <2 x half> %817, <2 x half> %816, <8 x i32> + %1601 = shufflevector <2 x half> %815, <2 x half> poison, <8 x i32> + %1602 = shufflevector <8 x half> %1600, <8 x half> %1601, <8 x i32> + %1603 = shufflevector <2 x half> %814, <2 x half> poison, <8 x i32> + %1604 = shufflevector <8 x half> %1602, <8 x half> %1603, <8 x i32> + store <8 x half> %1604, ptr addrspace(3) %218, align 16 + %1605 = shufflevector <2 x half> %813, <2 x half> %812, <8 x i32> + %1606 = shufflevector <2 x half> %811, <2 x half> poison, <8 x i32> + %1607 = shufflevector <8 x half> %1605, <8 x half> %1606, <8 x i32> + %1608 = shufflevector <2 x half> %810, <2 x half> poison, <8 x i32> + %1609 = shufflevector <8 x half> %1607, <8 x half> %1608, <8 x i32> + store <8 x half> %1609, ptr addrspace(3) %219, align 16 + %1610 = shufflevector <2 x half> %809, <2 x half> %808, <8 x i32> + %1611 = shufflevector <2 x half> %807, <2 x half> poison, <8 x i32> + %1612 = shufflevector <8 x half> %1610, <8 x half> %1611, <8 x 
i32> + %1613 = shufflevector <2 x half> %806, <2 x half> poison, <8 x i32> + %1614 = shufflevector <8 x half> %1612, <8 x half> %1613, <8 x i32> + store <8 x half> %1614, ptr addrspace(3) %220, align 16 + %1615 = shufflevector <2 x half> %805, <2 x half> %804, <8 x i32> + %1616 = shufflevector <2 x half> %803, <2 x half> poison, <8 x i32> + %1617 = shufflevector <8 x half> %1615, <8 x half> %1616, <8 x i32> + %1618 = shufflevector <2 x half> %802, <2 x half> poison, <8 x i32> + %1619 = shufflevector <8 x half> %1617, <8 x half> %1618, <8 x i32> + store <8 x half> %1619, ptr addrspace(3) %221, align 16 + %1620 = getelementptr i8, ptr addrspace(1) %751, i64 128 + %1621 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %1620, i16 0, i32 2147483646, i32 159744) + %1622 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %140, i32 0, i32 0) + %1623 = bitcast <4 x i32> %1622 to <8 x half> + %1624 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %143, i32 0, i32 0) + %1625 = bitcast <4 x i32> %1624 to <8 x half> + %1626 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %146, i32 0, i32 0) + %1627 = bitcast <4 x i32> %1626 to <8 x half> + %1628 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %149, i32 0, i32 0) + %1629 = bitcast <4 x i32> %1628 to <8 x half> + %1630 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %152, i32 0, i32 0) + %1631 = bitcast <4 x i32> %1630 to <8 x half> + %1632 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %155, i32 0, i32 0) + %1633 = bitcast <4 x i32> %1632 to <8 x half> + %1634 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %158, i32 0, i32 0) + %1635 = bitcast <4 x i32> %1634 to <8 x half> + %1636 = tail call <4 x i32> 
@llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %161, i32 0, i32 0) + %1637 = bitcast <4 x i32> %1636 to <8 x half> + %1638 = shufflevector <8 x half> %1362, <8 x half> poison, <4 x i32> + %1639 = shufflevector <8 x half> %1362, <8 x half> poison, <4 x i32> + %1640 = shufflevector <8 x half> %1363, <8 x half> poison, <4 x i32> + %1641 = shufflevector <8 x half> %1363, <8 x half> poison, <4 x i32> + %1642 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1638, <4 x float> %1099, i32 0, i32 0, i32 0) + %1643 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1639, <4 x float> %1642, i32 0, i32 0, i32 0) + %1644 = extractelement <4 x float> %1643, i64 0 + %1645 = extractelement <4 x float> %1643, i64 1 + %1646 = extractelement <4 x float> %1643, i64 2 + %1647 = extractelement <4 x float> %1643, i64 3 + %1648 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1638, <4 x float> %1105, i32 0, i32 0, i32 0) + %1649 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1639, <4 x float> %1648, i32 0, i32 0, i32 0) + %1650 = extractelement <4 x float> %1649, i64 0 + %1651 = extractelement <4 x float> %1649, i64 1 + %1652 = extractelement <4 x float> %1649, i64 2 + %1653 = extractelement <4 x float> %1649, i64 3 + %1654 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1640, <4 x float> %1111, i32 0, i32 0, i32 0) + %1655 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1641, <4 x float> %1654, i32 0, i32 0, i32 0) + %1656 = extractelement <4 x float> %1655, i64 0 + %1657 = extractelement <4 x float> %1655, i64 1 + %1658 = extractelement <4 x float> %1655, i64 2 + %1659 = extractelement <4 x float> %1655, i64 3 + %1660 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1640, <4 x float> %1117, i32 0, 
i32 0, i32 0) + %1661 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1641, <4 x float> %1660, i32 0, i32 0, i32 0) + %1662 = extractelement <4 x float> %1661, i64 0 + %1663 = extractelement <4 x float> %1661, i64 1 + %1664 = extractelement <4 x float> %1661, i64 2 + %1665 = extractelement <4 x float> %1661, i64 3 + %1666 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1638, <4 x float> %1123, i32 0, i32 0, i32 0) + %1667 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1639, <4 x float> %1666, i32 0, i32 0, i32 0) + %1668 = extractelement <4 x float> %1667, i64 0 + %1669 = extractelement <4 x float> %1667, i64 1 + %1670 = extractelement <4 x float> %1667, i64 2 + %1671 = extractelement <4 x float> %1667, i64 3 + %1672 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1638, <4 x float> %1129, i32 0, i32 0, i32 0) + %1673 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1639, <4 x float> %1672, i32 0, i32 0, i32 0) + %1674 = extractelement <4 x float> %1673, i64 0 + %1675 = extractelement <4 x float> %1673, i64 1 + %1676 = extractelement <4 x float> %1673, i64 2 + %1677 = extractelement <4 x float> %1673, i64 3 + %1678 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1640, <4 x float> %1135, i32 0, i32 0, i32 0) + %1679 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1641, <4 x float> %1678, i32 0, i32 0, i32 0) + %1680 = extractelement <4 x float> %1679, i64 0 + %1681 = extractelement <4 x float> %1679, i64 1 + %1682 = extractelement <4 x float> %1679, i64 2 + %1683 = extractelement <4 x float> %1679, i64 3 + %1684 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1640, <4 x float> %1141, i32 0, i32 0, i32 0) + %1685 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1641, <4 x float> %1684, i32 0, i32 0, i32 0) + %1686 = extractelement <4 x float> %1685, i64 0 + %1687 = extractelement <4 x float> %1685, i64 1 + %1688 = extractelement <4 x float> %1685, i64 2 + %1689 = extractelement <4 x float> %1685, i64 3 + %1690 = shufflevector <8 x half> %1364, <8 x half> poison, <4 x i32> + %1691 = shufflevector <8 x half> %1364, <8 x half> poison, <4 x i32> + %1692 = shufflevector <8 x half> %1365, <8 x half> poison, <4 x i32> + %1693 = shufflevector <8 x half> %1365, <8 x half> poison, <4 x i32> + %1694 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1690, <4 x float> %1151, i32 0, i32 0, i32 0) + %1695 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1691, <4 x float> %1694, i32 0, i32 0, i32 0) + %1696 = extractelement <4 x float> %1695, i64 0 + %1697 = extractelement <4 x float> %1695, i64 1 + %1698 = extractelement <4 x float> %1695, i64 2 + %1699 = extractelement <4 x float> %1695, i64 3 + %1700 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1690, <4 x float> %1157, i32 0, i32 0, i32 0) + %1701 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1691, <4 x float> %1700, i32 0, i32 0, i32 0) + %1702 = extractelement <4 x float> %1701, i64 0 + %1703 = extractelement <4 x float> %1701, i64 1 + %1704 = extractelement <4 x float> %1701, i64 2 + %1705 = extractelement <4 x float> %1701, i64 3 + %1706 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1692, <4 x float> %1163, i32 0, i32 0, i32 0) + %1707 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1693, <4 x float> %1706, i32 0, i32 0, i32 0) + %1708 = extractelement <4 x float> %1707, i64 0 + %1709 = extractelement <4 x float> %1707, i64 1 + %1710 = extractelement <4 x float> %1707, i64 2 + 
%1711 = extractelement <4 x float> %1707, i64 3 + %1712 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1692, <4 x float> %1169, i32 0, i32 0, i32 0) + %1713 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1693, <4 x float> %1712, i32 0, i32 0, i32 0) + %1714 = extractelement <4 x float> %1713, i64 0 + %1715 = extractelement <4 x float> %1713, i64 1 + %1716 = extractelement <4 x float> %1713, i64 2 + %1717 = extractelement <4 x float> %1713, i64 3 + %1718 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1690, <4 x float> %1175, i32 0, i32 0, i32 0) + %1719 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1691, <4 x float> %1718, i32 0, i32 0, i32 0) + %1720 = extractelement <4 x float> %1719, i64 0 + %1721 = extractelement <4 x float> %1719, i64 1 + %1722 = extractelement <4 x float> %1719, i64 2 + %1723 = extractelement <4 x float> %1719, i64 3 + %1724 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1690, <4 x float> %1181, i32 0, i32 0, i32 0) + %1725 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1691, <4 x float> %1724, i32 0, i32 0, i32 0) + %1726 = extractelement <4 x float> %1725, i64 0 + %1727 = extractelement <4 x float> %1725, i64 1 + %1728 = extractelement <4 x float> %1725, i64 2 + %1729 = extractelement <4 x float> %1725, i64 3 + %1730 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1692, <4 x float> %1187, i32 0, i32 0, i32 0) + %1731 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1693, <4 x float> %1730, i32 0, i32 0, i32 0) + %1732 = extractelement <4 x float> %1731, i64 0 + %1733 = extractelement <4 x float> %1731, i64 1 + %1734 = extractelement <4 x float> %1731, i64 2 + %1735 = extractelement <4 x float> %1731, i64 3 + %1736 = tail 
call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1692, <4 x float> %1193, i32 0, i32 0, i32 0) + %1737 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1693, <4 x float> %1736, i32 0, i32 0, i32 0) + %1738 = extractelement <4 x float> %1737, i64 0 + %1739 = extractelement <4 x float> %1737, i64 1 + %1740 = extractelement <4 x float> %1737, i64 2 + %1741 = extractelement <4 x float> %1737, i64 3 + %1742 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1638, <4 x float> %1199, i32 0, i32 0, i32 0) + %1743 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1639, <4 x float> %1742, i32 0, i32 0, i32 0) + %1744 = extractelement <4 x float> %1743, i64 0 + %1745 = extractelement <4 x float> %1743, i64 1 + %1746 = extractelement <4 x float> %1743, i64 2 + %1747 = extractelement <4 x float> %1743, i64 3 + %1748 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1638, <4 x float> %1205, i32 0, i32 0, i32 0) + %1749 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1639, <4 x float> %1748, i32 0, i32 0, i32 0) + %1750 = extractelement <4 x float> %1749, i64 0 + %1751 = extractelement <4 x float> %1749, i64 1 + %1752 = extractelement <4 x float> %1749, i64 2 + %1753 = extractelement <4 x float> %1749, i64 3 + %1754 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1640, <4 x float> %1211, i32 0, i32 0, i32 0) + %1755 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1641, <4 x float> %1754, i32 0, i32 0, i32 0) + %1756 = extractelement <4 x float> %1755, i64 0 + %1757 = extractelement <4 x float> %1755, i64 1 + %1758 = extractelement <4 x float> %1755, i64 2 + %1759 = extractelement <4 x float> %1755, i64 3 + %1760 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> 
%1478, <4 x half> %1640, <4 x float> %1217, i32 0, i32 0, i32 0) + %1761 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1641, <4 x float> %1760, i32 0, i32 0, i32 0) + %1762 = extractelement <4 x float> %1761, i64 0 + %1763 = extractelement <4 x float> %1761, i64 1 + %1764 = extractelement <4 x float> %1761, i64 2 + %1765 = extractelement <4 x float> %1761, i64 3 + fence syncscope("workgroup") release + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + %1766 = load <8 x half>, ptr addrspace(3) %233, align 16 + %1767 = load <8 x half>, ptr addrspace(3) %235, align 16 + %1768 = load <8 x half>, ptr addrspace(3) %243, align 16 + %1769 = load <8 x half>, ptr addrspace(3) %245, align 16 + %1770 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1638, <4 x float> %1227, i32 0, i32 0, i32 0) + %1771 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1639, <4 x float> %1770, i32 0, i32 0, i32 0) + %1772 = extractelement <4 x float> %1771, i64 0 + %1773 = extractelement <4 x float> %1771, i64 1 + %1774 = extractelement <4 x float> %1771, i64 2 + %1775 = extractelement <4 x float> %1771, i64 3 + %1776 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1638, <4 x float> %1233, i32 0, i32 0, i32 0) + %1777 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1639, <4 x float> %1776, i32 0, i32 0, i32 0) + %1778 = extractelement <4 x float> %1777, i64 0 + %1779 = extractelement <4 x float> %1777, i64 1 + %1780 = extractelement <4 x float> %1777, i64 2 + %1781 = extractelement <4 x float> %1777, i64 3 + %1782 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1640, <4 x float> %1239, i32 0, i32 0, i32 0) + %1783 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1641, <4 x float> %1782, i32 
0, i32 0, i32 0) + %1784 = extractelement <4 x float> %1783, i64 0 + %1785 = extractelement <4 x float> %1783, i64 1 + %1786 = extractelement <4 x float> %1783, i64 2 + %1787 = extractelement <4 x float> %1783, i64 3 + %1788 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1640, <4 x float> %1245, i32 0, i32 0, i32 0) + %1789 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1641, <4 x float> %1788, i32 0, i32 0, i32 0) + %1790 = extractelement <4 x float> %1789, i64 0 + %1791 = extractelement <4 x float> %1789, i64 1 + %1792 = extractelement <4 x float> %1789, i64 2 + %1793 = extractelement <4 x float> %1789, i64 3 + %1794 = load <8 x half>, ptr addrspace(3) %251, align 16 + %1795 = load <8 x half>, ptr addrspace(3) %253, align 16 + %1796 = load <8 x half>, ptr addrspace(3) %258, align 16 + %1797 = load <8 x half>, ptr addrspace(3) %260, align 16 + %1798 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1690, <4 x float> %1255, i32 0, i32 0, i32 0) + %1799 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1691, <4 x float> %1798, i32 0, i32 0, i32 0) + %1800 = extractelement <4 x float> %1799, i64 0 + %1801 = extractelement <4 x float> %1799, i64 1 + %1802 = extractelement <4 x float> %1799, i64 2 + %1803 = extractelement <4 x float> %1799, i64 3 + %1804 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1690, <4 x float> %1261, i32 0, i32 0, i32 0) + %1805 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1691, <4 x float> %1804, i32 0, i32 0, i32 0) + %1806 = extractelement <4 x float> %1805, i64 0 + %1807 = extractelement <4 x float> %1805, i64 1 + %1808 = extractelement <4 x float> %1805, i64 2 + %1809 = extractelement <4 x float> %1805, i64 3 + %1810 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> 
%1692, <4 x float> %1267, i32 0, i32 0, i32 0) + %1811 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1693, <4 x float> %1810, i32 0, i32 0, i32 0) + %1812 = extractelement <4 x float> %1811, i64 0 + %1813 = extractelement <4 x float> %1811, i64 1 + %1814 = extractelement <4 x float> %1811, i64 2 + %1815 = extractelement <4 x float> %1811, i64 3 + %1816 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1692, <4 x float> %1273, i32 0, i32 0, i32 0) + %1817 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1693, <4 x float> %1816, i32 0, i32 0, i32 0) + %1818 = extractelement <4 x float> %1817, i64 0 + %1819 = extractelement <4 x float> %1817, i64 1 + %1820 = extractelement <4 x float> %1817, i64 2 + %1821 = extractelement <4 x float> %1817, i64 3 + %1822 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1690, <4 x float> %1279, i32 0, i32 0, i32 0) + %1823 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1691, <4 x float> %1822, i32 0, i32 0, i32 0) + %1824 = extractelement <4 x float> %1823, i64 0 + %1825 = extractelement <4 x float> %1823, i64 1 + %1826 = extractelement <4 x float> %1823, i64 2 + %1827 = extractelement <4 x float> %1823, i64 3 + %1828 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1690, <4 x float> %1285, i32 0, i32 0, i32 0) + %1829 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1691, <4 x float> %1828, i32 0, i32 0, i32 0) + %1830 = extractelement <4 x float> %1829, i64 0 + %1831 = extractelement <4 x float> %1829, i64 1 + %1832 = extractelement <4 x float> %1829, i64 2 + %1833 = extractelement <4 x float> %1829, i64 3 + %1834 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1692, <4 x float> %1291, i32 0, i32 0, i32 0) + %1835 = tail 
call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1693, <4 x float> %1834, i32 0, i32 0, i32 0) + %1836 = extractelement <4 x float> %1835, i64 0 + %1837 = extractelement <4 x float> %1835, i64 1 + %1838 = extractelement <4 x float> %1835, i64 2 + %1839 = extractelement <4 x float> %1835, i64 3 + %1840 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1692, <4 x float> %1297, i32 0, i32 0, i32 0) + %1841 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1693, <4 x float> %1840, i32 0, i32 0, i32 0) + %1842 = extractelement <4 x float> %1841, i64 0 + %1843 = extractelement <4 x float> %1841, i64 1 + %1844 = extractelement <4 x float> %1841, i64 2 + %1845 = extractelement <4 x float> %1841, i64 3 + %1846 = add nuw nsw i32 %769, 1 + %exitcond.not = icmp eq i32 %769, %413 + %1847 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> + %1848 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> + %1849 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> + %1850 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> + %1851 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> + %1852 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> + %1853 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> + %1854 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> + %1855 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> + %1856 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> + %1857 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> + %1858 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> + %1859 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> + %1860 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> + %1861 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> + %1862 = shufflevector <8 x half> %1475, <8 x 
half> poison, <2 x i32> + %1863 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> + %1864 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> + %1865 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> + %1866 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> + %1867 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> + %1868 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> + %1869 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> + %1870 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> + %1871 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> + %1872 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> + %1873 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> + %1874 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> + %1875 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> + %1876 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> + %1877 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> + %1878 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> + %1879 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> + %1880 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> + %1881 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> + %1882 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> + %1883 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> + %1884 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> + %1885 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> + %1886 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> + %1887 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> + %1888 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> + %1889 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> + %1890 = shufflevector <8 x half> %1768, <8 x half> 
poison, <2 x i32> + %1891 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> + %1892 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> + %1893 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> + %1894 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> + %1895 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> + %1896 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> + %1897 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> + %1898 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> + %1899 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> + %1900 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> + %1901 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> + %1902 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> + %1903 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> + %1904 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> + %1905 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> + %1906 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> + %1907 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> + %1908 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> + %1909 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> + %1910 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> + %1911 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> + %1912 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> + %1913 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> + %1914 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> + %1915 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> + %1916 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> + %1917 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> + %1918 = shufflevector <8 x half> %1467, <8 x half> poison, <2 
x i32> + %1919 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> + %1920 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> + %1921 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> + %1922 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> + %1923 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> + %1924 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> + %1925 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> + %1926 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> + %1927 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> + %1928 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> + %1929 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> + %1930 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> + %1931 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> + %1932 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> + %1933 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> + %1934 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> + %1935 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> + %1936 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> + %1937 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> + %1938 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> + %1939 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> + %1940 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> + %1941 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> + %1942 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> + br i1 %exitcond.not, label %._crit_edge, label %510 + +._crit_edge: ; preds = %510, %.._crit_edge_crit_edge + %.pre-phi1068 = phi i32 [ %.pre1067, %.._crit_edge_crit_edge ], [ %410, %510 ] + %.pre-phi1066 = phi i32 [ %.pre1065, %.._crit_edge_crit_edge ], [ %409, %510 ] + %.pre-phi1064 
= phi i32 [ %.pre1063, %.._crit_edge_crit_edge ], [ %406, %510 ] + %.pre-phi1062 = phi i32 [ %.pre1061, %.._crit_edge_crit_edge ], [ %405, %510 ] + %.pre-phi1060 = phi i32 [ %.pre1059, %.._crit_edge_crit_edge ], [ %402, %510 ] + %.pre-phi1058 = phi i32 [ %.pre1057, %.._crit_edge_crit_edge ], [ %401, %510 ] + %.pre-phi1056 = phi i32 [ %.pre1055, %.._crit_edge_crit_edge ], [ %398, %510 ] + %.pre-phi1054 = phi i32 [ %.pre1053, %.._crit_edge_crit_edge ], [ %397, %510 ] + %.pre-phi1052 = phi i32 [ %.pre1051, %.._crit_edge_crit_edge ], [ %394, %510 ] + %.pre-phi1050 = phi i32 [ %.pre1049, %.._crit_edge_crit_edge ], [ %393, %510 ] + %.pre-phi1048 = phi i32 [ %.pre1047, %.._crit_edge_crit_edge ], [ %390, %510 ] + %.pre-phi1046 = phi i32 [ %.pre1045, %.._crit_edge_crit_edge ], [ %384, %510 ] + %.pre-phi1044 = phi i32 [ %.pre1043, %.._crit_edge_crit_edge ], [ %383, %510 ] + %.pre-phi1042 = phi i32 [ %.pre1041, %.._crit_edge_crit_edge ], [ %387, %510 ] + %.pre-phi1034 = phi i32 [ %.pre1033, %.._crit_edge_crit_edge ], [ %377, %510 ] + %.pre-phi1030 = phi i32 [ %.pre1029, %.._crit_edge_crit_edge ], [ %375, %510 ] + %.pre-phi1026 = phi i32 [ %.pre1025, %.._crit_edge_crit_edge ], [ %371, %510 ] + %.pre-phi1022 = phi i32 [ %.pre1021, %.._crit_edge_crit_edge ], [ %369, %510 ] + %.pre-phi1018 = phi i32 [ %.pre1017, %.._crit_edge_crit_edge ], [ %365, %510 ] + %.pre-phi1014 = phi i32 [ %.pre1013, %.._crit_edge_crit_edge ], [ %361, %510 ] + %1943 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1308, %510 ], !dbg !167 + %1944 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1309, %510 ], !dbg !167 + %1945 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1310, %510 ], !dbg !167 + %1946 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1311, %510 ], !dbg !167 + %1947 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1314, %510 ], !dbg !167 + %1948 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1315, %510 ], !dbg !167 + %1949 = phi 
float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1316, %510 ], !dbg !167 + %1950 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1317, %510 ], !dbg !167 + %1951 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1320, %510 ], !dbg !167 + %1952 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1321, %510 ], !dbg !167 + %1953 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1322, %510 ], !dbg !167 + %1954 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1323, %510 ], !dbg !167 + %1955 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1326, %510 ], !dbg !167 + %1956 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1327, %510 ], !dbg !167 + %1957 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1328, %510 ], !dbg !167 + %1958 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1329, %510 ], !dbg !167 + %1959 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1340, %510 ], !dbg !168 + %1960 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1341, %510 ], !dbg !168 + %1961 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1342, %510 ], !dbg !168 + %1962 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1343, %510 ], !dbg !168 + %1963 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1346, %510 ], !dbg !168 + %1964 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1347, %510 ], !dbg !168 + %1965 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1348, %510 ], !dbg !168 + %1966 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1349, %510 ], !dbg !168 + %1967 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1352, %510 ], !dbg !168 + %1968 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1353, %510 ], !dbg !168 + %1969 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1354, %510 ], !dbg !168 + %1970 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1355, %510 ], !dbg !168 + %1971 = phi float [ 
0.000000e+00, %.._crit_edge_crit_edge ], [ %1358, %510 ], !dbg !168 + %1972 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1359, %510 ], !dbg !168 + %1973 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1360, %510 ], !dbg !168 + %1974 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1361, %510 ], !dbg !168 + %1975 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1482, %510 ], !dbg !169 + %1976 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1483, %510 ], !dbg !169 + %1977 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1484, %510 ], !dbg !169 + %1978 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1485, %510 ], !dbg !169 + %1979 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1488, %510 ], !dbg !169 + %1980 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1489, %510 ], !dbg !169 + %1981 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1490, %510 ], !dbg !169 + %1982 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1491, %510 ], !dbg !169 + %1983 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1494, %510 ], !dbg !169 + %1984 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1495, %510 ], !dbg !169 + %1985 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1496, %510 ], !dbg !169 + %1986 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1497, %510 ], !dbg !169 + %1987 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1500, %510 ], !dbg !169 + %1988 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1501, %510 ], !dbg !169 + %1989 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1502, %510 ], !dbg !169 + %1990 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1503, %510 ], !dbg !169 + %1991 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1510, %510 ], !dbg !170 + %1992 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1511, %510 ], !dbg !170 + %1993 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1512, %510 ], !dbg !170 + %1994 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1513, %510 ], !dbg !170 + %1995 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1516, %510 ], !dbg !170 + %1996 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1517, %510 ], !dbg !170 + %1997 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1518, %510 ], !dbg !170 + %1998 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1519, %510 ], !dbg !170 + %1999 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1522, %510 ], !dbg !170 + %2000 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1523, %510 ], !dbg !170 + %2001 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1524, %510 ], !dbg !170 + %2002 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1525, %510 ], !dbg !170 + %2003 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1528, %510 ], !dbg !170 + %2004 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1529, %510 ], !dbg !170 + %2005 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1530, %510 ], !dbg !170 + %2006 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1531, %510 ], !dbg !170 + %2007 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1372, %510 ], !dbg !171 + %2008 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1373, %510 ], !dbg !171 + %2009 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1374, %510 ], !dbg !171 + %2010 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1375, %510 ], !dbg !171 + %2011 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1378, %510 ], !dbg !171 + %2012 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1379, %510 ], !dbg !171 + %2013 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1380, %510 ], !dbg !171 + %2014 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1381, %510 ], !dbg !171 + %2015 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1384, %510 ], !dbg !171 + %2016 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1385, %510 ], !dbg !171 + %2017 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1386, %510 ], !dbg !171 + %2018 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1387, %510 ], !dbg !171 + %2019 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1390, %510 ], !dbg !171 + %2020 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1391, %510 ], !dbg !171 + %2021 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1392, %510 ], !dbg !171 + %2022 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1393, %510 ], !dbg !171 + %2023 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1396, %510 ], !dbg !172 + %2024 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1397, %510 ], !dbg !172 + %2025 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1398, %510 ], !dbg !172 + %2026 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1399, %510 ], !dbg !172 + %2027 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1402, %510 ], !dbg !172 + %2028 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1403, %510 ], !dbg !172 + %2029 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1404, %510 ], !dbg !172 + %2030 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1405, %510 ], !dbg !172 + %2031 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1408, %510 ], !dbg !172 + %2032 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1409, %510 ], !dbg !172 + %2033 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1410, %510 ], !dbg !172 + %2034 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1411, %510 ], !dbg !172 + %2035 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1414, %510 ], !dbg !172 + %2036 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1415, %510 ], !dbg !172 + %2037 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1416, %510 ], !dbg !172 + %2038 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1417, %510 ], !dbg !172 + %2039 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1534, %510 ], !dbg !173 + %2040 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1535, %510 ], !dbg !173 + %2041 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1536, %510 ], !dbg !173 + %2042 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1537, %510 ], !dbg !173 + %2043 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1540, %510 ], !dbg !173 + %2044 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1541, %510 ], !dbg !173 + %2045 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1542, %510 ], !dbg !173 + %2046 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1543, %510 ], !dbg !173 + %2047 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1546, %510 ], !dbg !173 + %2048 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1547, %510 ], !dbg !173 + %2049 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1548, %510 ], !dbg !173 + %2050 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1549, %510 ], !dbg !173 + %2051 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1552, %510 ], !dbg !173 + %2052 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1553, %510 ], !dbg !173 + %2053 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1554, %510 ], !dbg !173 + %2054 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1555, %510 ], !dbg !173 + %2055 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1558, %510 ], !dbg !174 + %2056 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1559, %510 ], !dbg !174 + %2057 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1560, %510 ], !dbg !174 + %2058 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1561, %510 ], !dbg !174 + %2059 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1564, %510 ], !dbg !174 + %2060 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1565, %510 ], !dbg !174 + %2061 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1566, %510 ], !dbg !174 + %2062 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1567, %510 ], !dbg !174 + %2063 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1570, %510 ], !dbg !174 + %2064 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1571, %510 ], !dbg !174 + %2065 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1572, %510 ], !dbg !174 + %2066 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1573, %510 ], !dbg !174 + %2067 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1576, %510 ], !dbg !174 + %2068 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1577, %510 ], !dbg !174 + %2069 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1578, %510 ], !dbg !174 + %2070 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1579, %510 ], !dbg !174 + %2071 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1644, %510 ], !dbg !175 + %2072 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1645, %510 ], !dbg !175 + %2073 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1646, %510 ], !dbg !175 + %2074 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1647, %510 ], !dbg !175 + %2075 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1650, %510 ], !dbg !175 + %2076 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1651, %510 ], !dbg !175 + %2077 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1652, %510 ], !dbg !175 + %2078 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1653, %510 ], !dbg !175 + %2079 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1656, %510 ], !dbg !175 + %2080 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1657, %510 ], !dbg !175 + %2081 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1658, %510 ], !dbg !175 + %2082 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1659, %510 ], !dbg !175 + %2083 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1662, %510 ], !dbg !175 + %2084 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1663, %510 ], !dbg !175 + %2085 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1664, %510 ], !dbg !175 + %2086 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1665, %510 ], !dbg !175 + %2087 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1668, %510 ], !dbg !176 + %2088 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1669, %510 ], !dbg !176 + %2089 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1670, %510 ], !dbg !176 + %2090 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1671, %510 ], !dbg !176 + %2091 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1674, %510 ], !dbg !176 + %2092 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1675, %510 ], !dbg !176 + %2093 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1676, %510 ], !dbg !176 + %2094 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1677, %510 ], !dbg !176 + %2095 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1680, %510 ], !dbg !176 + %2096 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1681, %510 ], !dbg !176 + %2097 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1682, %510 ], !dbg !176 + %2098 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1683, %510 ], !dbg !176 + %2099 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1686, %510 ], !dbg !176 + %2100 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1687, %510 ], !dbg !176 + %2101 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1688, %510 ], !dbg !176 + %2102 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1689, %510 ], !dbg !176 + %2103 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1744, %510 ], !dbg !177 + %2104 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1745, %510 ], !dbg !177 + %2105 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1746, %510 ], !dbg !177 + %2106 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1747, %510 ], !dbg !177 + %2107 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1750, %510 ], !dbg !177 + %2108 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1751, %510 ], !dbg !177 + %2109 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1752, %510 ], !dbg !177 + %2110 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1753, %510 ], !dbg !177 + %2111 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1756, %510 ], !dbg !177 + %2112 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1757, %510 ], !dbg !177 + %2113 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1758, %510 ], !dbg !177 + %2114 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1759, %510 ], !dbg !177 + %2115 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1762, %510 ], !dbg !177 + %2116 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1763, %510 ], !dbg !177 + %2117 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1764, %510 ], !dbg !177 + %2118 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1765, %510 ], !dbg !177 + %2119 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1772, %510 ], !dbg !178 + %2120 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1773, %510 ], !dbg !178 + %2121 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1774, %510 ], !dbg !178 + %2122 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1775, %510 ], !dbg !178 + %2123 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1778, %510 ], !dbg !178 + %2124 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1779, %510 ], !dbg !178 + %2125 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1780, %510 ], !dbg !178 + %2126 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1781, %510 ], !dbg !178 + %2127 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1784, %510 ], !dbg !178 + %2128 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1785, %510 ], !dbg !178 + %2129 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1786, %510 ], !dbg !178 + %2130 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1787, %510 ], !dbg !178 + %2131 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1790, %510 ], !dbg !178 + %2132 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1791, %510 ], !dbg !178 + %2133 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1792, %510 ], !dbg !178 + %2134 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1793, %510 ], !dbg !178 + %2135 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1696, %510 ], !dbg !179 + %2136 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1697, %510 ], !dbg !179 + %2137 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1698, %510 ], !dbg !179 + %2138 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1699, %510 ], !dbg !179 + %2139 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1702, %510 ], !dbg !179 + %2140 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1703, %510 ], !dbg !179 + %2141 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1704, %510 ], !dbg !179 + %2142 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1705, %510 ], !dbg !179 + %2143 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1708, %510 ], !dbg !179 + %2144 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1709, %510 ], !dbg !179 + %2145 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1710, %510 ], !dbg !179 + %2146 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1711, %510 ], !dbg !179 + %2147 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1714, %510 ], !dbg !179 + %2148 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1715, %510 ], !dbg !179 + %2149 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1716, %510 ], !dbg !179 + %2150 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1717, %510 ], !dbg !179 + %2151 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1720, %510 ], !dbg !180 + %2152 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1721, %510 ], !dbg !180 + %2153 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1722, %510 ], !dbg !180 + %2154 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1723, %510 ], !dbg !180 + %2155 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1726, %510 ], !dbg !180 + %2156 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1727, %510 ], !dbg !180 + %2157 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1728, %510 ], !dbg !180 + %2158 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1729, %510 ], !dbg !180 + %2159 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1732, %510 ], !dbg !180 + %2160 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1733, %510 ], !dbg !180 + %2161 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1734, %510 ], !dbg !180 + %2162 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1735, %510 ], !dbg !180 + %2163 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1738, %510 ], !dbg !180 + %2164 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1739, %510 ], !dbg !180 + %2165 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1740, %510 ], !dbg !180 + %2166 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1741, %510 ], !dbg !180 + %2167 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1800, %510 ], !dbg !181 + %2168 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1801, %510 ], !dbg !181 + %2169 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1802, %510 ], !dbg !181 + %2170 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1803, %510 ], !dbg !181 + %2171 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1806, %510 ], !dbg !181 + %2172 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1807, %510 ], !dbg !181 + %2173 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1808, %510 ], !dbg !181 + %2174 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1809, %510 ], !dbg !181 + %2175 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1812, %510 ], !dbg !181 + %2176 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1813, %510 ], !dbg !181 + %2177 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1814, %510 ], !dbg !181 + %2178 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1815, %510 ], !dbg !181 + %2179 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1818, %510 ], !dbg !181 + %2180 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1819, %510 ], !dbg !181 + %2181 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1820, %510 ], !dbg !181 + %2182 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1821, %510 ], !dbg !181 + %2183 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1824, %510 ], !dbg !182 + %2184 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1825, %510 ], !dbg !182 + %2185 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1826, %510 ], !dbg !182 + %2186 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1827, %510 ], !dbg !182 + %2187 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1830, %510 ], !dbg !182 + %2188 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1831, %510 ], !dbg !182 + %2189 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1832, %510 ], !dbg !182 + %2190 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1833, %510 ], !dbg !182 + %2191 = phi float [ 0.000000e+00, 
%.._crit_edge_crit_edge ], [ %1836, %510 ], !dbg !182 + %2192 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1837, %510 ], !dbg !182 + %2193 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1838, %510 ], !dbg !182 + %2194 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1839, %510 ], !dbg !182 + %2195 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1842, %510 ], !dbg !182 + %2196 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1843, %510 ], !dbg !182 + %2197 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1844, %510 ], !dbg !182 + %2198 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1845, %510 ], !dbg !182 + %2199 = phi <2 x half> [ %263, %.._crit_edge_crit_edge ], [ %1847, %510 ] + %2200 = phi <2 x half> [ %264, %.._crit_edge_crit_edge ], [ %1848, %510 ] + %2201 = phi <2 x half> [ %265, %.._crit_edge_crit_edge ], [ %1849, %510 ] + %2202 = phi <2 x half> [ %266, %.._crit_edge_crit_edge ], [ %1850, %510 ] + %2203 = phi <2 x half> [ %267, %.._crit_edge_crit_edge ], [ %1851, %510 ] + %2204 = phi <2 x half> [ %268, %.._crit_edge_crit_edge ], [ %1852, %510 ] + %2205 = phi <2 x half> [ %269, %.._crit_edge_crit_edge ], [ %1853, %510 ] + %2206 = phi <2 x half> [ %270, %.._crit_edge_crit_edge ], [ %1854, %510 ] + %2207 = phi <2 x half> [ %271, %.._crit_edge_crit_edge ], [ %1855, %510 ] + %2208 = phi <2 x half> [ %272, %.._crit_edge_crit_edge ], [ %1856, %510 ] + %2209 = phi <2 x half> [ %273, %.._crit_edge_crit_edge ], [ %1857, %510 ] + %2210 = phi <2 x half> [ %274, %.._crit_edge_crit_edge ], [ %1858, %510 ] + %2211 = phi <2 x half> [ %275, %.._crit_edge_crit_edge ], [ %1859, %510 ] + %2212 = phi <2 x half> [ %276, %.._crit_edge_crit_edge ], [ %1860, %510 ] + %2213 = phi <2 x half> [ %277, %.._crit_edge_crit_edge ], [ %1861, %510 ] + %2214 = phi <2 x half> [ %278, %.._crit_edge_crit_edge ], [ %1862, %510 ] + %2215 = phi <2 x half> [ %279, %.._crit_edge_crit_edge ], [ %1863, %510 ] + %2216 = phi <2 x 
half> [ %280, %.._crit_edge_crit_edge ], [ %1864, %510 ] + %2217 = phi <2 x half> [ %281, %.._crit_edge_crit_edge ], [ %1865, %510 ] + %2218 = phi <2 x half> [ %282, %.._crit_edge_crit_edge ], [ %1866, %510 ] + %2219 = phi <2 x half> [ %283, %.._crit_edge_crit_edge ], [ %1867, %510 ] + %2220 = phi <2 x half> [ %284, %.._crit_edge_crit_edge ], [ %1868, %510 ] + %2221 = phi <2 x half> [ %285, %.._crit_edge_crit_edge ], [ %1869, %510 ] + %2222 = phi <2 x half> [ %286, %.._crit_edge_crit_edge ], [ %1870, %510 ] + %2223 = phi <2 x half> [ %287, %.._crit_edge_crit_edge ], [ %1871, %510 ] + %2224 = phi <2 x half> [ %288, %.._crit_edge_crit_edge ], [ %1872, %510 ] + %2225 = phi <2 x half> [ %289, %.._crit_edge_crit_edge ], [ %1873, %510 ] + %2226 = phi <2 x half> [ %290, %.._crit_edge_crit_edge ], [ %1874, %510 ] + %2227 = phi <2 x half> [ %291, %.._crit_edge_crit_edge ], [ %1875, %510 ] + %2228 = phi <2 x half> [ %292, %.._crit_edge_crit_edge ], [ %1876, %510 ] + %2229 = phi <2 x half> [ %293, %.._crit_edge_crit_edge ], [ %1877, %510 ] + %2230 = phi <2 x half> [ %294, %.._crit_edge_crit_edge ], [ %1878, %510 ] + %2231 = phi <2 x half> [ %295, %.._crit_edge_crit_edge ], [ %1879, %510 ] + %2232 = phi <2 x half> [ %296, %.._crit_edge_crit_edge ], [ %1880, %510 ] + %2233 = phi <2 x half> [ %297, %.._crit_edge_crit_edge ], [ %1881, %510 ] + %2234 = phi <2 x half> [ %298, %.._crit_edge_crit_edge ], [ %1882, %510 ] + %2235 = phi <2 x half> [ %299, %.._crit_edge_crit_edge ], [ %1883, %510 ] + %2236 = phi <2 x half> [ %300, %.._crit_edge_crit_edge ], [ %1884, %510 ] + %2237 = phi <2 x half> [ %301, %.._crit_edge_crit_edge ], [ %1885, %510 ] + %2238 = phi <2 x half> [ %302, %.._crit_edge_crit_edge ], [ %1886, %510 ] + %2239 = phi <2 x half> [ %303, %.._crit_edge_crit_edge ], [ %1887, %510 ] + %2240 = phi <2 x half> [ %304, %.._crit_edge_crit_edge ], [ %1888, %510 ] + %2241 = phi <2 x half> [ %305, %.._crit_edge_crit_edge ], [ %1889, %510 ] + %2242 = phi <2 x half> [ %306, 
%.._crit_edge_crit_edge ], [ %1890, %510 ] + %2243 = phi <2 x half> [ %307, %.._crit_edge_crit_edge ], [ %1891, %510 ] + %2244 = phi <2 x half> [ %308, %.._crit_edge_crit_edge ], [ %1892, %510 ] + %2245 = phi <2 x half> [ %309, %.._crit_edge_crit_edge ], [ %1893, %510 ] + %2246 = phi <2 x half> [ %310, %.._crit_edge_crit_edge ], [ %1894, %510 ] + %2247 = phi <2 x half> [ %311, %.._crit_edge_crit_edge ], [ %1895, %510 ] + %2248 = phi <2 x half> [ %312, %.._crit_edge_crit_edge ], [ %1896, %510 ] + %2249 = phi <2 x half> [ %313, %.._crit_edge_crit_edge ], [ %1897, %510 ] + %2250 = phi <2 x half> [ %314, %.._crit_edge_crit_edge ], [ %1898, %510 ] + %2251 = phi <2 x half> [ %315, %.._crit_edge_crit_edge ], [ %1899, %510 ] + %2252 = phi <2 x half> [ %316, %.._crit_edge_crit_edge ], [ %1900, %510 ] + %2253 = phi <2 x half> [ %317, %.._crit_edge_crit_edge ], [ %1901, %510 ] + %2254 = phi <2 x half> [ %318, %.._crit_edge_crit_edge ], [ %1902, %510 ] + %2255 = phi <2 x half> [ %319, %.._crit_edge_crit_edge ], [ %1903, %510 ] + %2256 = phi <2 x half> [ %320, %.._crit_edge_crit_edge ], [ %1904, %510 ] + %2257 = phi <2 x half> [ %321, %.._crit_edge_crit_edge ], [ %1905, %510 ] + %2258 = phi <2 x half> [ %322, %.._crit_edge_crit_edge ], [ %1906, %510 ] + %2259 = phi <2 x half> [ %323, %.._crit_edge_crit_edge ], [ %1907, %510 ] + %2260 = phi <2 x half> [ %324, %.._crit_edge_crit_edge ], [ %1908, %510 ] + %2261 = phi <2 x half> [ %325, %.._crit_edge_crit_edge ], [ %1909, %510 ] + %2262 = phi <2 x half> [ %326, %.._crit_edge_crit_edge ], [ %1910, %510 ] + %2263 = phi <2 x half> [ %327, %.._crit_edge_crit_edge ], [ %1911, %510 ] + %2264 = phi <2 x half> [ %328, %.._crit_edge_crit_edge ], [ %1912, %510 ] + %2265 = phi <2 x half> [ %329, %.._crit_edge_crit_edge ], [ %1913, %510 ] + %2266 = phi <2 x half> [ %330, %.._crit_edge_crit_edge ], [ %1914, %510 ] + %2267 = phi <2 x half> [ %331, %.._crit_edge_crit_edge ], [ %1915, %510 ] + %2268 = phi <2 x half> [ %332, %.._crit_edge_crit_edge 
], [ %1916, %510 ] + %2269 = phi <2 x half> [ %333, %.._crit_edge_crit_edge ], [ %1917, %510 ] + %2270 = phi <2 x half> [ %334, %.._crit_edge_crit_edge ], [ %1918, %510 ] + %2271 = phi <2 x half> [ %335, %.._crit_edge_crit_edge ], [ %1919, %510 ] + %2272 = phi <2 x half> [ %336, %.._crit_edge_crit_edge ], [ %1920, %510 ] + %2273 = phi <2 x half> [ %337, %.._crit_edge_crit_edge ], [ %1921, %510 ] + %2274 = phi <2 x half> [ %338, %.._crit_edge_crit_edge ], [ %1922, %510 ] + %2275 = phi <2 x half> [ %339, %.._crit_edge_crit_edge ], [ %1923, %510 ] + %2276 = phi <2 x half> [ %340, %.._crit_edge_crit_edge ], [ %1924, %510 ] + %2277 = phi <2 x half> [ %341, %.._crit_edge_crit_edge ], [ %1925, %510 ] + %2278 = phi <2 x half> [ %342, %.._crit_edge_crit_edge ], [ %1926, %510 ] + %2279 = phi <2 x half> [ %343, %.._crit_edge_crit_edge ], [ %1927, %510 ] + %2280 = phi <2 x half> [ %344, %.._crit_edge_crit_edge ], [ %1928, %510 ] + %2281 = phi <2 x half> [ %345, %.._crit_edge_crit_edge ], [ %1929, %510 ] + %2282 = phi <2 x half> [ %346, %.._crit_edge_crit_edge ], [ %1930, %510 ] + %2283 = phi <2 x half> [ %347, %.._crit_edge_crit_edge ], [ %1931, %510 ] + %2284 = phi <2 x half> [ %348, %.._crit_edge_crit_edge ], [ %1932, %510 ] + %2285 = phi <2 x half> [ %349, %.._crit_edge_crit_edge ], [ %1933, %510 ] + %2286 = phi <2 x half> [ %350, %.._crit_edge_crit_edge ], [ %1934, %510 ] + %2287 = phi <2 x half> [ %351, %.._crit_edge_crit_edge ], [ %1935, %510 ] + %2288 = phi <2 x half> [ %352, %.._crit_edge_crit_edge ], [ %1936, %510 ] + %2289 = phi <2 x half> [ %353, %.._crit_edge_crit_edge ], [ %1937, %510 ] + %2290 = phi <2 x half> [ %354, %.._crit_edge_crit_edge ], [ %1938, %510 ] + %2291 = phi <2 x half> [ %355, %.._crit_edge_crit_edge ], [ %1939, %510 ] + %2292 = phi <2 x half> [ %356, %.._crit_edge_crit_edge ], [ %1940, %510 ] + %2293 = phi <2 x half> [ %357, %.._crit_edge_crit_edge ], [ %1941, %510 ] + %2294 = phi <2 x half> [ %358, %.._crit_edge_crit_edge ], [ %1942, %510 ] + 
%2295 = and i32 %237, 28 + %2296 = or disjoint i32 %2295, 224 + %2297 = or disjoint i32 %2295, 192 + %2298 = or disjoint i32 %2295, 160 + %2299 = or disjoint i32 %2295, 128 + %2300 = or disjoint i32 %2295, 96 + %2301 = or disjoint i32 %2295, 64 + %2302 = or disjoint i32 %2295, 32 + %2303 = or disjoint i32 %225, 224 + %2304 = or disjoint i32 %225, 192 + %2305 = or disjoint i32 %225, 160 + %2306 = or disjoint i32 %225, 128 + %2307 = or disjoint i32 %225, 96 + %2308 = or disjoint i32 %225, 64 + %2309 = or disjoint i32 %225, 32 + %2310 = shufflevector <2 x half> %2231, <2 x half> %2232, <4 x i32> + %2311 = shufflevector <2 x half> %2233, <2 x half> %2234, <4 x i32> + %2312 = shufflevector <2 x half> %2235, <2 x half> %2236, <4 x i32> + %2313 = shufflevector <2 x half> %2237, <2 x half> %2238, <4 x i32> + %2314 = shufflevector <2 x half> %2239, <2 x half> %2240, <4 x i32> + %2315 = shufflevector <2 x half> %2241, <2 x half> %2242, <4 x i32> + %2316 = shufflevector <2 x half> %2243, <2 x half> %2244, <4 x i32> + %2317 = shufflevector <2 x half> %2245, <2 x half> %2246, <4 x i32> + %2318 = insertelement <4 x float> poison, float %1943, i64 0 + %2319 = insertelement <4 x float> %2318, float %1944, i64 1 + %2320 = insertelement <4 x float> %2319, float %1945, i64 2 + %2321 = insertelement <4 x float> %2320, float %1946, i64 3 + %2322 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2310, <4 x float> %2321, i32 0, i32 0, i32 0) + %2323 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2311, <4 x float> %2322, i32 0, i32 0, i32 0) + %2324 = insertelement <4 x float> poison, float %1947, i64 0 + %2325 = insertelement <4 x float> %2324, float %1948, i64 1 + %2326 = insertelement <4 x float> %2325, float %1949, i64 2 + %2327 = insertelement <4 x float> %2326, float %1950, i64 3 + %2328 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2310, <4 x float> %2327, i32 0, 
i32 0, i32 0) + %2329 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2311, <4 x float> %2328, i32 0, i32 0, i32 0) + %2330 = insertelement <4 x float> poison, float %1951, i64 0 + %2331 = insertelement <4 x float> %2330, float %1952, i64 1 + %2332 = insertelement <4 x float> %2331, float %1953, i64 2 + %2333 = insertelement <4 x float> %2332, float %1954, i64 3 + %2334 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2312, <4 x float> %2333, i32 0, i32 0, i32 0) + %2335 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2313, <4 x float> %2334, i32 0, i32 0, i32 0) + %2336 = insertelement <4 x float> poison, float %1955, i64 0 + %2337 = insertelement <4 x float> %2336, float %1956, i64 1 + %2338 = insertelement <4 x float> %2337, float %1957, i64 2 + %2339 = insertelement <4 x float> %2338, float %1958, i64 3 + %2340 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2312, <4 x float> %2339, i32 0, i32 0, i32 0) + %2341 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2313, <4 x float> %2340, i32 0, i32 0, i32 0) + %2342 = shufflevector <2 x half> %2255, <2 x half> %2256, <4 x i32> + %2343 = shufflevector <2 x half> %2257, <2 x half> %2258, <4 x i32> + %2344 = shufflevector <2 x half> %2259, <2 x half> %2260, <4 x i32> + %2345 = shufflevector <2 x half> %2261, <2 x half> %2262, <4 x i32> + %2346 = insertelement <4 x float> poison, float %1959, i64 0 + %2347 = insertelement <4 x float> %2346, float %1960, i64 1 + %2348 = insertelement <4 x float> %2347, float %1961, i64 2 + %2349 = insertelement <4 x float> %2348, float %1962, i64 3 + %2350 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2310, <4 x float> %2349, i32 0, i32 0, i32 0) + %2351 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2311, <4 x 
float> %2350, i32 0, i32 0, i32 0) + %2352 = insertelement <4 x float> poison, float %1963, i64 0 + %2353 = insertelement <4 x float> %2352, float %1964, i64 1 + %2354 = insertelement <4 x float> %2353, float %1965, i64 2 + %2355 = insertelement <4 x float> %2354, float %1966, i64 3 + %2356 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2310, <4 x float> %2355, i32 0, i32 0, i32 0) + %2357 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2311, <4 x float> %2356, i32 0, i32 0, i32 0) + %2358 = insertelement <4 x float> poison, float %1967, i64 0 + %2359 = insertelement <4 x float> %2358, float %1968, i64 1 + %2360 = insertelement <4 x float> %2359, float %1969, i64 2 + %2361 = insertelement <4 x float> %2360, float %1970, i64 3 + %2362 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2312, <4 x float> %2361, i32 0, i32 0, i32 0) + %2363 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2313, <4 x float> %2362, i32 0, i32 0, i32 0) + %2364 = insertelement <4 x float> poison, float %1971, i64 0 + %2365 = insertelement <4 x float> %2364, float %1972, i64 1 + %2366 = insertelement <4 x float> %2365, float %1973, i64 2 + %2367 = insertelement <4 x float> %2366, float %1974, i64 3 + %2368 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2312, <4 x float> %2367, i32 0, i32 0, i32 0) + %2369 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2313, <4 x float> %2368, i32 0, i32 0, i32 0) + %2370 = shufflevector <2 x half> %2247, <2 x half> %2248, <4 x i32> + %2371 = shufflevector <2 x half> %2249, <2 x half> %2250, <4 x i32> + %2372 = shufflevector <2 x half> %2251, <2 x half> %2252, <4 x i32> + %2373 = shufflevector <2 x half> %2253, <2 x half> %2254, <4 x i32> + %2374 = insertelement <4 x float> poison, float %2007, i64 0 + %2375 = 
insertelement <4 x float> %2374, float %2008, i64 1 + %2376 = insertelement <4 x float> %2375, float %2009, i64 2 + %2377 = insertelement <4 x float> %2376, float %2010, i64 3 + %2378 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2370, <4 x float> %2377, i32 0, i32 0, i32 0) + %2379 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2371, <4 x float> %2378, i32 0, i32 0, i32 0) + %2380 = insertelement <4 x float> poison, float %2011, i64 0 + %2381 = insertelement <4 x float> %2380, float %2012, i64 1 + %2382 = insertelement <4 x float> %2381, float %2013, i64 2 + %2383 = insertelement <4 x float> %2382, float %2014, i64 3 + %2384 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2370, <4 x float> %2383, i32 0, i32 0, i32 0) + %2385 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2371, <4 x float> %2384, i32 0, i32 0, i32 0) + %2386 = insertelement <4 x float> poison, float %2015, i64 0 + %2387 = insertelement <4 x float> %2386, float %2016, i64 1 + %2388 = insertelement <4 x float> %2387, float %2017, i64 2 + %2389 = insertelement <4 x float> %2388, float %2018, i64 3 + %2390 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2372, <4 x float> %2389, i32 0, i32 0, i32 0) + %2391 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2373, <4 x float> %2390, i32 0, i32 0, i32 0) + %2392 = insertelement <4 x float> poison, float %2019, i64 0 + %2393 = insertelement <4 x float> %2392, float %2020, i64 1 + %2394 = insertelement <4 x float> %2393, float %2021, i64 2 + %2395 = insertelement <4 x float> %2394, float %2022, i64 3 + %2396 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2372, <4 x float> %2395, i32 0, i32 0, i32 0) + %2397 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> 
%2317, <4 x half> %2373, <4 x float> %2396, i32 0, i32 0, i32 0) + %2398 = insertelement <4 x float> poison, float %2023, i64 0 + %2399 = insertelement <4 x float> %2398, float %2024, i64 1 + %2400 = insertelement <4 x float> %2399, float %2025, i64 2 + %2401 = insertelement <4 x float> %2400, float %2026, i64 3 + %2402 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2370, <4 x float> %2401, i32 0, i32 0, i32 0) + %2403 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2371, <4 x float> %2402, i32 0, i32 0, i32 0) + %2404 = insertelement <4 x float> poison, float %2027, i64 0 + %2405 = insertelement <4 x float> %2404, float %2028, i64 1 + %2406 = insertelement <4 x float> %2405, float %2029, i64 2 + %2407 = insertelement <4 x float> %2406, float %2030, i64 3 + %2408 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2370, <4 x float> %2407, i32 0, i32 0, i32 0) + %2409 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2371, <4 x float> %2408, i32 0, i32 0, i32 0) + %2410 = insertelement <4 x float> poison, float %2031, i64 0 + %2411 = insertelement <4 x float> %2410, float %2032, i64 1 + %2412 = insertelement <4 x float> %2411, float %2033, i64 2 + %2413 = insertelement <4 x float> %2412, float %2034, i64 3 + %2414 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2372, <4 x float> %2413, i32 0, i32 0, i32 0) + %2415 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2373, <4 x float> %2414, i32 0, i32 0, i32 0) + %2416 = insertelement <4 x float> poison, float %2035, i64 0 + %2417 = insertelement <4 x float> %2416, float %2036, i64 1 + %2418 = insertelement <4 x float> %2417, float %2037, i64 2 + %2419 = insertelement <4 x float> %2418, float %2038, i64 3 + %2420 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 
x half> %2372, <4 x float> %2419, i32 0, i32 0, i32 0) + %2421 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2373, <4 x float> %2420, i32 0, i32 0, i32 0) + %2422 = or disjoint i32 %.pre-phi1014, 2048 + %2423 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1014 + %2424 = load <8 x half>, ptr addrspace(3) %2423, align 16 + %2425 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2422 + %2426 = load <8 x half>, ptr addrspace(3) %2425, align 16 + %2427 = shufflevector <8 x half> %2424, <8 x half> poison, <4 x i32> + %2428 = shufflevector <8 x half> %2424, <8 x half> poison, <4 x i32> + %2429 = shufflevector <8 x half> %2426, <8 x half> poison, <4 x i32> + %2430 = shufflevector <8 x half> %2426, <8 x half> poison, <4 x i32> + %2431 = insertelement <4 x float> poison, float %1975, i64 0 + %2432 = insertelement <4 x float> %2431, float %1976, i64 1 + %2433 = insertelement <4 x float> %2432, float %1977, i64 2 + %2434 = insertelement <4 x float> %2433, float %1978, i64 3 + %2435 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2310, <4 x float> %2434, i32 0, i32 0, i32 0) + %2436 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2311, <4 x float> %2435, i32 0, i32 0, i32 0) + %2437 = insertelement <4 x float> poison, float %1979, i64 0 + %2438 = insertelement <4 x float> %2437, float %1980, i64 1 + %2439 = insertelement <4 x float> %2438, float %1981, i64 2 + %2440 = insertelement <4 x float> %2439, float %1982, i64 3 + %2441 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2310, <4 x float> %2440, i32 0, i32 0, i32 0) + %2442 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2311, <4 x float> %2441, i32 0, i32 0, i32 0) + %2443 = insertelement <4 x float> 
poison, float %1983, i64 0 + %2444 = insertelement <4 x float> %2443, float %1984, i64 1 + %2445 = insertelement <4 x float> %2444, float %1985, i64 2 + %2446 = insertelement <4 x float> %2445, float %1986, i64 3 + %2447 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2312, <4 x float> %2446, i32 0, i32 0, i32 0) + %2448 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2313, <4 x float> %2447, i32 0, i32 0, i32 0) + %2449 = insertelement <4 x float> poison, float %1987, i64 0 + %2450 = insertelement <4 x float> %2449, float %1988, i64 1 + %2451 = insertelement <4 x float> %2450, float %1989, i64 2 + %2452 = insertelement <4 x float> %2451, float %1990, i64 3 + %2453 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2312, <4 x float> %2452, i32 0, i32 0, i32 0) + %2454 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2313, <4 x float> %2453, i32 0, i32 0, i32 0) + %2455 = or disjoint i32 %.pre-phi1018, 2048 + %2456 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1018 + %2457 = load <8 x half>, ptr addrspace(3) %2456, align 16 + %2458 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2455 + %2459 = load <8 x half>, ptr addrspace(3) %2458, align 16 + %2460 = shufflevector <8 x half> %2457, <8 x half> poison, <4 x i32> + %2461 = shufflevector <8 x half> %2457, <8 x half> poison, <4 x i32> + %2462 = shufflevector <8 x half> %2459, <8 x half> poison, <4 x i32> + %2463 = shufflevector <8 x half> %2459, <8 x half> poison, <4 x i32> + %2464 = insertelement <4 x float> poison, float %1991, i64 0 + %2465 = insertelement <4 x float> %2464, float %1992, i64 1 + %2466 = insertelement <4 x float> %2465, float %1993, i64 2 + %2467 = insertelement <4 x float> %2466, float %1994, i64 3 + %2468 = tail call <4 x 
float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2310, <4 x float> %2467, i32 0, i32 0, i32 0) + %2469 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2311, <4 x float> %2468, i32 0, i32 0, i32 0) + %2470 = insertelement <4 x float> poison, float %1995, i64 0 + %2471 = insertelement <4 x float> %2470, float %1996, i64 1 + %2472 = insertelement <4 x float> %2471, float %1997, i64 2 + %2473 = insertelement <4 x float> %2472, float %1998, i64 3 + %2474 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2310, <4 x float> %2473, i32 0, i32 0, i32 0) + %2475 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2311, <4 x float> %2474, i32 0, i32 0, i32 0) + %2476 = insertelement <4 x float> poison, float %1999, i64 0 + %2477 = insertelement <4 x float> %2476, float %2000, i64 1 + %2478 = insertelement <4 x float> %2477, float %2001, i64 2 + %2479 = insertelement <4 x float> %2478, float %2002, i64 3 + %2480 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2312, <4 x float> %2479, i32 0, i32 0, i32 0) + %2481 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2313, <4 x float> %2480, i32 0, i32 0, i32 0) + %2482 = insertelement <4 x float> poison, float %2003, i64 0 + %2483 = insertelement <4 x float> %2482, float %2004, i64 1 + %2484 = insertelement <4 x float> %2483, float %2005, i64 2 + %2485 = insertelement <4 x float> %2484, float %2006, i64 3 + %2486 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2312, <4 x float> %2485, i32 0, i32 0, i32 0) + %2487 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2313, <4 x float> %2486, i32 0, i32 0, i32 0) + %2488 = insertelement <4 x float> poison, float %2039, i64 0 + %2489 = insertelement <4 x float> %2488, float %2040, i64 1 + %2490 = 
insertelement <4 x float> %2489, float %2041, i64 2 + %2491 = insertelement <4 x float> %2490, float %2042, i64 3 + %2492 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2370, <4 x float> %2491, i32 0, i32 0, i32 0) + %2493 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2371, <4 x float> %2492, i32 0, i32 0, i32 0) + %2494 = insertelement <4 x float> poison, float %2043, i64 0 + %2495 = insertelement <4 x float> %2494, float %2044, i64 1 + %2496 = insertelement <4 x float> %2495, float %2045, i64 2 + %2497 = insertelement <4 x float> %2496, float %2046, i64 3 + %2498 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2370, <4 x float> %2497, i32 0, i32 0, i32 0) + %2499 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2371, <4 x float> %2498, i32 0, i32 0, i32 0) + %2500 = insertelement <4 x float> poison, float %2047, i64 0 + %2501 = insertelement <4 x float> %2500, float %2048, i64 1 + %2502 = insertelement <4 x float> %2501, float %2049, i64 2 + %2503 = insertelement <4 x float> %2502, float %2050, i64 3 + %2504 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2372, <4 x float> %2503, i32 0, i32 0, i32 0) + %2505 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2373, <4 x float> %2504, i32 0, i32 0, i32 0) + %2506 = insertelement <4 x float> poison, float %2051, i64 0 + %2507 = insertelement <4 x float> %2506, float %2052, i64 1 + %2508 = insertelement <4 x float> %2507, float %2053, i64 2 + %2509 = insertelement <4 x float> %2508, float %2054, i64 3 + %2510 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2372, <4 x float> %2509, i32 0, i32 0, i32 0) + %2511 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2373, <4 x float> %2510, i32 0, i32 0, i32 0) + 
%2512 = insertelement <4 x float> poison, float %2055, i64 0 + %2513 = insertelement <4 x float> %2512, float %2056, i64 1 + %2514 = insertelement <4 x float> %2513, float %2057, i64 2 + %2515 = insertelement <4 x float> %2514, float %2058, i64 3 + %2516 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2370, <4 x float> %2515, i32 0, i32 0, i32 0) + %2517 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2371, <4 x float> %2516, i32 0, i32 0, i32 0) + %2518 = insertelement <4 x float> poison, float %2059, i64 0 + %2519 = insertelement <4 x float> %2518, float %2060, i64 1 + %2520 = insertelement <4 x float> %2519, float %2061, i64 2 + %2521 = insertelement <4 x float> %2520, float %2062, i64 3 + %2522 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2370, <4 x float> %2521, i32 0, i32 0, i32 0) + %2523 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2371, <4 x float> %2522, i32 0, i32 0, i32 0) + %2524 = insertelement <4 x float> poison, float %2063, i64 0 + %2525 = insertelement <4 x float> %2524, float %2064, i64 1 + %2526 = insertelement <4 x float> %2525, float %2065, i64 2 + %2527 = insertelement <4 x float> %2526, float %2066, i64 3 + %2528 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2372, <4 x float> %2527, i32 0, i32 0, i32 0) + %2529 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2373, <4 x float> %2528, i32 0, i32 0, i32 0) + %2530 = insertelement <4 x float> poison, float %2067, i64 0 + %2531 = insertelement <4 x float> %2530, float %2068, i64 1 + %2532 = insertelement <4 x float> %2531, float %2069, i64 2 + %2533 = insertelement <4 x float> %2532, float %2070, i64 3 + %2534 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2372, <4 x float> %2533, i32 0, i32 0, i32 0) + %2535 = 
tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2373, <4 x float> %2534, i32 0, i32 0, i32 0) + %2536 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1022 + %2537 = load <8 x half>, ptr addrspace(3) %2536, align 16 + %2538 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1026 + %2539 = load <8 x half>, ptr addrspace(3) %2538, align 16 + %2540 = shufflevector <8 x half> %2537, <8 x half> poison, <4 x i32> + %2541 = shufflevector <8 x half> %2537, <8 x half> poison, <4 x i32> + %2542 = shufflevector <8 x half> %2539, <8 x half> poison, <4 x i32> + %2543 = shufflevector <8 x half> %2539, <8 x half> poison, <4 x i32> + %2544 = insertelement <4 x float> poison, float %2071, i64 0 + %2545 = insertelement <4 x float> %2544, float %2072, i64 1 + %2546 = insertelement <4 x float> %2545, float %2073, i64 2 + %2547 = insertelement <4 x float> %2546, float %2074, i64 3 + %2548 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2540, <4 x float> %2547, i32 0, i32 0, i32 0) + %2549 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2541, <4 x float> %2548, i32 0, i32 0, i32 0) + %2550 = insertelement <4 x float> poison, float %2075, i64 0 + %2551 = insertelement <4 x float> %2550, float %2076, i64 1 + %2552 = insertelement <4 x float> %2551, float %2077, i64 2 + %2553 = insertelement <4 x float> %2552, float %2078, i64 3 + %2554 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2540, <4 x float> %2553, i32 0, i32 0, i32 0) + %2555 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2541, <4 x float> %2554, i32 0, i32 0, i32 0) + %2556 = insertelement <4 x float> poison, float %2079, i64 0 + %2557 = insertelement <4 x float> %2556, float %2080, i64 1 + %2558 = insertelement <4 x float> %2557, float %2081, i64 2 + %2559 = insertelement <4 x float> %2558, float 
%2082, i64 3 + %2560 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2542, <4 x float> %2559, i32 0, i32 0, i32 0) + %2561 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2543, <4 x float> %2560, i32 0, i32 0, i32 0) + %2562 = insertelement <4 x float> poison, float %2083, i64 0 + %2563 = insertelement <4 x float> %2562, float %2084, i64 1 + %2564 = insertelement <4 x float> %2563, float %2085, i64 2 + %2565 = insertelement <4 x float> %2564, float %2086, i64 3 + %2566 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2542, <4 x float> %2565, i32 0, i32 0, i32 0) + %2567 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2543, <4 x float> %2566, i32 0, i32 0, i32 0) + %2568 = insertelement <4 x float> poison, float %2087, i64 0 + %2569 = insertelement <4 x float> %2568, float %2088, i64 1 + %2570 = insertelement <4 x float> %2569, float %2089, i64 2 + %2571 = insertelement <4 x float> %2570, float %2090, i64 3 + %2572 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2540, <4 x float> %2571, i32 0, i32 0, i32 0) + %2573 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2541, <4 x float> %2572, i32 0, i32 0, i32 0) + %2574 = insertelement <4 x float> poison, float %2091, i64 0 + %2575 = insertelement <4 x float> %2574, float %2092, i64 1 + %2576 = insertelement <4 x float> %2575, float %2093, i64 2 + %2577 = insertelement <4 x float> %2576, float %2094, i64 3 + %2578 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2540, <4 x float> %2577, i32 0, i32 0, i32 0) + %2579 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2541, <4 x float> %2578, i32 0, i32 0, i32 0) + %2580 = insertelement <4 x float> poison, float %2095, i64 0 + %2581 = insertelement <4 x float> 
%2580, float %2096, i64 1 + %2582 = insertelement <4 x float> %2581, float %2097, i64 2 + %2583 = insertelement <4 x float> %2582, float %2098, i64 3 + %2584 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2542, <4 x float> %2583, i32 0, i32 0, i32 0) + %2585 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2543, <4 x float> %2584, i32 0, i32 0, i32 0) + %2586 = insertelement <4 x float> poison, float %2099, i64 0 + %2587 = insertelement <4 x float> %2586, float %2100, i64 1 + %2588 = insertelement <4 x float> %2587, float %2101, i64 2 + %2589 = insertelement <4 x float> %2588, float %2102, i64 3 + %2590 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2542, <4 x float> %2589, i32 0, i32 0, i32 0) + %2591 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2543, <4 x float> %2590, i32 0, i32 0, i32 0) + %2592 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1030 + %2593 = load <8 x half>, ptr addrspace(3) %2592, align 16 + %2594 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1034 + %2595 = load <8 x half>, ptr addrspace(3) %2594, align 16 + %2596 = shufflevector <8 x half> %2593, <8 x half> poison, <4 x i32> + %2597 = shufflevector <8 x half> %2593, <8 x half> poison, <4 x i32> + %2598 = shufflevector <8 x half> %2595, <8 x half> poison, <4 x i32> + %2599 = shufflevector <8 x half> %2595, <8 x half> poison, <4 x i32> + %2600 = insertelement <4 x float> poison, float %2135, i64 0 + %2601 = insertelement <4 x float> %2600, float %2136, i64 1 + %2602 = insertelement <4 x float> %2601, float %2137, i64 2 + %2603 = insertelement <4 x float> %2602, float %2138, i64 3 + %2604 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2596, <4 x float> %2603, i32 0, i32 0, i32 0) + %2605 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> 
%2315, <4 x half> %2597, <4 x float> %2604, i32 0, i32 0, i32 0) + %2606 = insertelement <4 x float> poison, float %2139, i64 0 + %2607 = insertelement <4 x float> %2606, float %2140, i64 1 + %2608 = insertelement <4 x float> %2607, float %2141, i64 2 + %2609 = insertelement <4 x float> %2608, float %2142, i64 3 + %2610 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2596, <4 x float> %2609, i32 0, i32 0, i32 0) + %2611 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2597, <4 x float> %2610, i32 0, i32 0, i32 0) + %2612 = insertelement <4 x float> poison, float %2143, i64 0 + %2613 = insertelement <4 x float> %2612, float %2144, i64 1 + %2614 = insertelement <4 x float> %2613, float %2145, i64 2 + %2615 = insertelement <4 x float> %2614, float %2146, i64 3 + %2616 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2598, <4 x float> %2615, i32 0, i32 0, i32 0) + %2617 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2599, <4 x float> %2616, i32 0, i32 0, i32 0) + %2618 = insertelement <4 x float> poison, float %2147, i64 0 + %2619 = insertelement <4 x float> %2618, float %2148, i64 1 + %2620 = insertelement <4 x float> %2619, float %2149, i64 2 + %2621 = insertelement <4 x float> %2620, float %2150, i64 3 + %2622 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2598, <4 x float> %2621, i32 0, i32 0, i32 0) + %2623 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2599, <4 x float> %2622, i32 0, i32 0, i32 0) + %2624 = insertelement <4 x float> poison, float %2151, i64 0 + %2625 = insertelement <4 x float> %2624, float %2152, i64 1 + %2626 = insertelement <4 x float> %2625, float %2153, i64 2 + %2627 = insertelement <4 x float> %2626, float %2154, i64 3 + %2628 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 
x half> %2596, <4 x float> %2627, i32 0, i32 0, i32 0) + %2629 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2597, <4 x float> %2628, i32 0, i32 0, i32 0) + %2630 = insertelement <4 x float> poison, float %2155, i64 0 + %2631 = insertelement <4 x float> %2630, float %2156, i64 1 + %2632 = insertelement <4 x float> %2631, float %2157, i64 2 + %2633 = insertelement <4 x float> %2632, float %2158, i64 3 + %2634 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2596, <4 x float> %2633, i32 0, i32 0, i32 0) + %2635 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2597, <4 x float> %2634, i32 0, i32 0, i32 0) + %2636 = insertelement <4 x float> poison, float %2159, i64 0 + %2637 = insertelement <4 x float> %2636, float %2160, i64 1 + %2638 = insertelement <4 x float> %2637, float %2161, i64 2 + %2639 = insertelement <4 x float> %2638, float %2162, i64 3 + %2640 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2598, <4 x float> %2639, i32 0, i32 0, i32 0) + %2641 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2599, <4 x float> %2640, i32 0, i32 0, i32 0) + %2642 = insertelement <4 x float> poison, float %2163, i64 0 + %2643 = insertelement <4 x float> %2642, float %2164, i64 1 + %2644 = insertelement <4 x float> %2643, float %2165, i64 2 + %2645 = insertelement <4 x float> %2644, float %2166, i64 3 + %2646 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2598, <4 x float> %2645, i32 0, i32 0, i32 0) + %2647 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2599, <4 x float> %2646, i32 0, i32 0, i32 0) + %2648 = insertelement <4 x float> poison, float %2103, i64 0 + %2649 = insertelement <4 x float> %2648, float %2104, i64 1 + %2650 = insertelement <4 x float> %2649, float %2105, i64 2 + %2651 = 
insertelement <4 x float> %2650, float %2106, i64 3 + %2652 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2540, <4 x float> %2651, i32 0, i32 0, i32 0) + %2653 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2541, <4 x float> %2652, i32 0, i32 0, i32 0) + %2654 = insertelement <4 x float> poison, float %2107, i64 0 + %2655 = insertelement <4 x float> %2654, float %2108, i64 1 + %2656 = insertelement <4 x float> %2655, float %2109, i64 2 + %2657 = insertelement <4 x float> %2656, float %2110, i64 3 + %2658 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2540, <4 x float> %2657, i32 0, i32 0, i32 0) + %2659 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2541, <4 x float> %2658, i32 0, i32 0, i32 0) + %2660 = insertelement <4 x float> poison, float %2111, i64 0 + %2661 = insertelement <4 x float> %2660, float %2112, i64 1 + %2662 = insertelement <4 x float> %2661, float %2113, i64 2 + %2663 = insertelement <4 x float> %2662, float %2114, i64 3 + %2664 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2542, <4 x float> %2663, i32 0, i32 0, i32 0) + %2665 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2543, <4 x float> %2664, i32 0, i32 0, i32 0) + %2666 = insertelement <4 x float> poison, float %2115, i64 0 + %2667 = insertelement <4 x float> %2666, float %2116, i64 1 + %2668 = insertelement <4 x float> %2667, float %2117, i64 2 + %2669 = insertelement <4 x float> %2668, float %2118, i64 3 + %2670 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2542, <4 x float> %2669, i32 0, i32 0, i32 0) + %2671 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2543, <4 x float> %2670, i32 0, i32 0, i32 0) + %2672 = insertelement <4 x float> poison, float %2119, i64 0 + 
%2673 = insertelement <4 x float> %2672, float %2120, i64 1 + %2674 = insertelement <4 x float> %2673, float %2121, i64 2 + %2675 = insertelement <4 x float> %2674, float %2122, i64 3 + %2676 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2540, <4 x float> %2675, i32 0, i32 0, i32 0) + %2677 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2541, <4 x float> %2676, i32 0, i32 0, i32 0) + %2678 = insertelement <4 x float> poison, float %2123, i64 0 + %2679 = insertelement <4 x float> %2678, float %2124, i64 1 + %2680 = insertelement <4 x float> %2679, float %2125, i64 2 + %2681 = insertelement <4 x float> %2680, float %2126, i64 3 + %2682 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2540, <4 x float> %2681, i32 0, i32 0, i32 0) + %2683 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2541, <4 x float> %2682, i32 0, i32 0, i32 0) + %2684 = insertelement <4 x float> poison, float %2127, i64 0 + %2685 = insertelement <4 x float> %2684, float %2128, i64 1 + %2686 = insertelement <4 x float> %2685, float %2129, i64 2 + %2687 = insertelement <4 x float> %2686, float %2130, i64 3 + %2688 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2542, <4 x float> %2687, i32 0, i32 0, i32 0) + %2689 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2543, <4 x float> %2688, i32 0, i32 0, i32 0) + %2690 = insertelement <4 x float> poison, float %2131, i64 0 + %2691 = insertelement <4 x float> %2690, float %2132, i64 1 + %2692 = insertelement <4 x float> %2691, float %2133, i64 2 + %2693 = insertelement <4 x float> %2692, float %2134, i64 3 + %2694 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2542, <4 x float> %2693, i32 0, i32 0, i32 0) + %2695 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x 
half> %2463, <4 x half> %2543, <4 x float> %2694, i32 0, i32 0, i32 0) + %2696 = insertelement <4 x float> poison, float %2167, i64 0 + %2697 = insertelement <4 x float> %2696, float %2168, i64 1 + %2698 = insertelement <4 x float> %2697, float %2169, i64 2 + %2699 = insertelement <4 x float> %2698, float %2170, i64 3 + %2700 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2596, <4 x float> %2699, i32 0, i32 0, i32 0) + %2701 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2597, <4 x float> %2700, i32 0, i32 0, i32 0) + %2702 = insertelement <4 x float> poison, float %2171, i64 0 + %2703 = insertelement <4 x float> %2702, float %2172, i64 1 + %2704 = insertelement <4 x float> %2703, float %2173, i64 2 + %2705 = insertelement <4 x float> %2704, float %2174, i64 3 + %2706 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2596, <4 x float> %2705, i32 0, i32 0, i32 0) + %2707 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2597, <4 x float> %2706, i32 0, i32 0, i32 0) + %2708 = insertelement <4 x float> poison, float %2175, i64 0 + %2709 = insertelement <4 x float> %2708, float %2176, i64 1 + %2710 = insertelement <4 x float> %2709, float %2177, i64 2 + %2711 = insertelement <4 x float> %2710, float %2178, i64 3 + %2712 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2598, <4 x float> %2711, i32 0, i32 0, i32 0) + %2713 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2599, <4 x float> %2712, i32 0, i32 0, i32 0) + %2714 = insertelement <4 x float> poison, float %2179, i64 0 + %2715 = insertelement <4 x float> %2714, float %2180, i64 1 + %2716 = insertelement <4 x float> %2715, float %2181, i64 2 + %2717 = insertelement <4 x float> %2716, float %2182, i64 3 + %2718 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> 
%2429, <4 x half> %2598, <4 x float> %2717, i32 0, i32 0, i32 0) + %2719 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2599, <4 x float> %2718, i32 0, i32 0, i32 0) + %2720 = insertelement <4 x float> poison, float %2183, i64 0 + %2721 = insertelement <4 x float> %2720, float %2184, i64 1 + %2722 = insertelement <4 x float> %2721, float %2185, i64 2 + %2723 = insertelement <4 x float> %2722, float %2186, i64 3 + %2724 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2596, <4 x float> %2723, i32 0, i32 0, i32 0) + %2725 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2597, <4 x float> %2724, i32 0, i32 0, i32 0) + %2726 = insertelement <4 x float> poison, float %2187, i64 0 + %2727 = insertelement <4 x float> %2726, float %2188, i64 1 + %2728 = insertelement <4 x float> %2727, float %2189, i64 2 + %2729 = insertelement <4 x float> %2728, float %2190, i64 3 + %2730 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2596, <4 x float> %2729, i32 0, i32 0, i32 0) + %2731 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2597, <4 x float> %2730, i32 0, i32 0, i32 0) + %2732 = insertelement <4 x float> poison, float %2191, i64 0 + %2733 = insertelement <4 x float> %2732, float %2192, i64 1 + %2734 = insertelement <4 x float> %2733, float %2193, i64 2 + %2735 = insertelement <4 x float> %2734, float %2194, i64 3 + %2736 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2598, <4 x float> %2735, i32 0, i32 0, i32 0) + %2737 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2599, <4 x float> %2736, i32 0, i32 0, i32 0) + %2738 = insertelement <4 x float> poison, float %2195, i64 0 + %2739 = insertelement <4 x float> %2738, float %2196, i64 1 + %2740 = insertelement <4 x float> %2739, float %2197, i64 2 + 
%2741 = insertelement <4 x float> %2740, float %2198, i64 3 + %2742 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2598, <4 x float> %2741, i32 0, i32 0, i32 0) + %2743 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2599, <4 x float> %2742, i32 0, i32 0, i32 0) + %2744 = or disjoint i32 %.pre-phi1042, 2048 + %2745 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1042 + %2746 = load <8 x half>, ptr addrspace(3) %2745, align 16 + %2747 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2744 + %2748 = load <8 x half>, ptr addrspace(3) %2747, align 16 + %2749 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1044 + %2750 = load <8 x half>, ptr addrspace(3) %2749, align 16 + %2751 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1046 + %2752 = load <8 x half>, ptr addrspace(3) %2751, align 16 + %2753 = shufflevector <8 x half> %2750, <8 x half> poison, <4 x i32> + %2754 = shufflevector <8 x half> %2750, <8 x half> poison, <4 x i32> + %2755 = shufflevector <8 x half> %2752, <8 x half> poison, <4 x i32> + %2756 = shufflevector <8 x half> %2752, <8 x half> poison, <4 x i32> + %2757 = shufflevector <8 x half> %2746, <8 x half> poison, <4 x i32> + %2758 = shufflevector <8 x half> %2746, <8 x half> poison, <4 x i32> + %2759 = shufflevector <8 x half> %2748, <8 x half> poison, <4 x i32> + %2760 = shufflevector <8 x half> %2748, <8 x half> poison, <4 x i32> + %2761 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2753, <4 x float> %2323, i32 0, i32 0, i32 0) + %2762 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2754, <4 x float> %2761, i32 0, i32 0, i32 0) + %2763 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2753, <4 x float> 
%2329, i32 0, i32 0, i32 0) + %2764 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2754, <4 x float> %2763, i32 0, i32 0, i32 0) + %2765 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2755, <4 x float> %2335, i32 0, i32 0, i32 0) + %2766 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2756, <4 x float> %2765, i32 0, i32 0, i32 0) + %2767 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2755, <4 x float> %2341, i32 0, i32 0, i32 0) + %2768 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2756, <4 x float> %2767, i32 0, i32 0, i32 0) + %2769 = or disjoint i32 %.pre-phi1048, 2048 + %2770 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1048 + %2771 = load <8 x half>, ptr addrspace(3) %2770, align 16 + %2772 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2769 + %2773 = load <8 x half>, ptr addrspace(3) %2772, align 16 + %2774 = shufflevector <8 x half> %2771, <8 x half> poison, <4 x i32> + %2775 = shufflevector <8 x half> %2771, <8 x half> poison, <4 x i32> + %2776 = shufflevector <8 x half> %2773, <8 x half> poison, <4 x i32> + %2777 = shufflevector <8 x half> %2773, <8 x half> poison, <4 x i32> + %2778 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2753, <4 x float> %2351, i32 0, i32 0, i32 0) + %2779 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2754, <4 x float> %2778, i32 0, i32 0, i32 0) + %2780 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2753, <4 x float> %2357, i32 0, i32 0, i32 0) + %2781 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2754, <4 x float> %2780, i32 0, i32 0, i32 
0) + %2782 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2755, <4 x float> %2363, i32 0, i32 0, i32 0) + %2783 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2756, <4 x float> %2782, i32 0, i32 0, i32 0) + %2784 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2755, <4 x float> %2369, i32 0, i32 0, i32 0) + %2785 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2756, <4 x float> %2784, i32 0, i32 0, i32 0) + %2786 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1050 + %2787 = load <8 x half>, ptr addrspace(3) %2786, align 16 + %2788 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1052 + %2789 = load <8 x half>, ptr addrspace(3) %2788, align 16 + %2790 = shufflevector <8 x half> %2787, <8 x half> poison, <4 x i32> + %2791 = shufflevector <8 x half> %2787, <8 x half> poison, <4 x i32> + %2792 = shufflevector <8 x half> %2789, <8 x half> poison, <4 x i32> + %2793 = shufflevector <8 x half> %2789, <8 x half> poison, <4 x i32> + %2794 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2790, <4 x float> %2379, i32 0, i32 0, i32 0) + %2795 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2791, <4 x float> %2794, i32 0, i32 0, i32 0) + %2796 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2790, <4 x float> %2385, i32 0, i32 0, i32 0) + %2797 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2791, <4 x float> %2796, i32 0, i32 0, i32 0) + %2798 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2792, <4 x float> %2391, i32 0, i32 0, i32 0) + %2799 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2793, <4 x float> %2798, i32 0, i32 0, i32 0) + %2800 = tail 
call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2792, <4 x float> %2397, i32 0, i32 0, i32 0) + %2801 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2793, <4 x float> %2800, i32 0, i32 0, i32 0) + %2802 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2790, <4 x float> %2403, i32 0, i32 0, i32 0) + %2803 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2791, <4 x float> %2802, i32 0, i32 0, i32 0) + %2804 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2790, <4 x float> %2409, i32 0, i32 0, i32 0) + %2805 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2791, <4 x float> %2804, i32 0, i32 0, i32 0) + %2806 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2792, <4 x float> %2415, i32 0, i32 0, i32 0) + %2807 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2793, <4 x float> %2806, i32 0, i32 0, i32 0) + %2808 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2792, <4 x float> %2421, i32 0, i32 0, i32 0) + %2809 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2793, <4 x float> %2808, i32 0, i32 0, i32 0) + %2810 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1054 + %2811 = load <8 x half>, ptr addrspace(3) %2810, align 16 + %2812 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1056 + %2813 = load <8 x half>, ptr addrspace(3) %2812, align 16 + %2814 = shufflevector <8 x half> %2811, <8 x half> poison, <4 x i32> + %2815 = shufflevector <8 x half> %2811, <8 x half> poison, <4 x i32> + %2816 = shufflevector <8 x half> %2813, <8 x half> poison, <4 x i32> + %2817 
= shufflevector <8 x half> %2813, <8 x half> poison, <4 x i32> + %2818 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2753, <4 x float> %2436, i32 0, i32 0, i32 0) + %2819 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2754, <4 x float> %2818, i32 0, i32 0, i32 0) + %2820 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2753, <4 x float> %2442, i32 0, i32 0, i32 0) + %2821 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2754, <4 x float> %2820, i32 0, i32 0, i32 0) + %2822 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2755, <4 x float> %2448, i32 0, i32 0, i32 0) + %2823 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2756, <4 x float> %2822, i32 0, i32 0, i32 0) + %2824 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2755, <4 x float> %2454, i32 0, i32 0, i32 0) + %2825 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2756, <4 x float> %2824, i32 0, i32 0, i32 0) + %2826 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1058 + %2827 = load <8 x half>, ptr addrspace(3) %2826, align 16 + %2828 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1060 + %2829 = load <8 x half>, ptr addrspace(3) %2828, align 16 + %2830 = shufflevector <8 x half> %2827, <8 x half> poison, <4 x i32> + %2831 = shufflevector <8 x half> %2827, <8 x half> poison, <4 x i32> + %2832 = shufflevector <8 x half> %2829, <8 x half> poison, <4 x i32> + %2833 = shufflevector <8 x half> %2829, <8 x half> poison, <4 x i32> + %2834 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2753, <4 x float> %2469, i32 0, i32 0, i32 0) 
+ %2835 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2754, <4 x float> %2834, i32 0, i32 0, i32 0) + %2836 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2753, <4 x float> %2475, i32 0, i32 0, i32 0) + %2837 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2754, <4 x float> %2836, i32 0, i32 0, i32 0) + %2838 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2755, <4 x float> %2481, i32 0, i32 0, i32 0) + %2839 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2756, <4 x float> %2838, i32 0, i32 0, i32 0) + %2840 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2755, <4 x float> %2487, i32 0, i32 0, i32 0) + %2841 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2756, <4 x float> %2840, i32 0, i32 0, i32 0) + %2842 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2790, <4 x float> %2493, i32 0, i32 0, i32 0) + %2843 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2791, <4 x float> %2842, i32 0, i32 0, i32 0) + %2844 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2790, <4 x float> %2499, i32 0, i32 0, i32 0) + %2845 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2791, <4 x float> %2844, i32 0, i32 0, i32 0) + %2846 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2792, <4 x float> %2505, i32 0, i32 0, i32 0) + %2847 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2793, <4 x float> %2846, i32 0, i32 0, i32 0) + %2848 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2792, <4 x float> %2511, i32 0, i32 0, i32 0) + %2849 = 
tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2793, <4 x float> %2848, i32 0, i32 0, i32 0) + %2850 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2790, <4 x float> %2517, i32 0, i32 0, i32 0) + %2851 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2791, <4 x float> %2850, i32 0, i32 0, i32 0) + %2852 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2790, <4 x float> %2523, i32 0, i32 0, i32 0) + %2853 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2791, <4 x float> %2852, i32 0, i32 0, i32 0) + %2854 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2792, <4 x float> %2529, i32 0, i32 0, i32 0) + %2855 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2793, <4 x float> %2854, i32 0, i32 0, i32 0) + %2856 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2792, <4 x float> %2535, i32 0, i32 0, i32 0) + %2857 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2793, <4 x float> %2856, i32 0, i32 0, i32 0) + %2858 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1062 + %2859 = load <8 x half>, ptr addrspace(3) %2858, align 16 + %2860 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1064 + %2861 = load <8 x half>, ptr addrspace(3) %2860, align 16 + %2862 = shufflevector <8 x half> %2859, <8 x half> poison, <4 x i32> + %2863 = shufflevector <8 x half> %2859, <8 x half> poison, <4 x i32> + %2864 = shufflevector <8 x half> %2861, <8 x half> poison, <4 x i32> + %2865 = shufflevector <8 x half> %2861, <8 x half> poison, <4 x i32> + %2866 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2862, <4 x float> %2549, i32 0, i32 0, i32 0) + %2867 = tail call <4 x 
float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2863, <4 x float> %2866, i32 0, i32 0, i32 0) + %2868 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2862, <4 x float> %2555, i32 0, i32 0, i32 0) + %2869 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2863, <4 x float> %2868, i32 0, i32 0, i32 0) + %2870 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2864, <4 x float> %2561, i32 0, i32 0, i32 0) + %2871 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2865, <4 x float> %2870, i32 0, i32 0, i32 0) + %2872 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2864, <4 x float> %2567, i32 0, i32 0, i32 0) + %2873 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2865, <4 x float> %2872, i32 0, i32 0, i32 0) + %2874 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2862, <4 x float> %2573, i32 0, i32 0, i32 0) + %2875 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2863, <4 x float> %2874, i32 0, i32 0, i32 0) + %2876 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2862, <4 x float> %2579, i32 0, i32 0, i32 0) + %2877 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2863, <4 x float> %2876, i32 0, i32 0, i32 0) + %2878 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2864, <4 x float> %2585, i32 0, i32 0, i32 0) + %2879 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2865, <4 x float> %2878, i32 0, i32 0, i32 0) + %2880 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2864, <4 x float> %2591, i32 0, i32 0, i32 0) + %2881 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2865, <4 x float> %2880, i32 0, i32 0, i32 0) + %2882 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1066 + %2883 = load <8 x half>, ptr addrspace(3) %2882, align 16 + %2884 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1068 + %2885 = load <8 x half>, ptr addrspace(3) %2884, align 16 + %2886 = shufflevector <8 x half> %2883, <8 x half> poison, <4 x i32> + %2887 = shufflevector <8 x half> %2883, <8 x half> poison, <4 x i32> + %2888 = shufflevector <8 x half> %2885, <8 x half> poison, <4 x i32> + %2889 = shufflevector <8 x half> %2885, <8 x half> poison, <4 x i32> + %2890 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2886, <4 x float> %2605, i32 0, i32 0, i32 0) + %2891 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2887, <4 x float> %2890, i32 0, i32 0, i32 0) + %2892 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2886, <4 x float> %2611, i32 0, i32 0, i32 0) + %2893 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2887, <4 x float> %2892, i32 0, i32 0, i32 0) + %2894 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2888, <4 x float> %2617, i32 0, i32 0, i32 0) + %2895 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2889, <4 x float> %2894, i32 0, i32 0, i32 0) + %2896 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2888, <4 x float> %2623, i32 0, i32 0, i32 0) + %2897 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2889, <4 x float> %2896, i32 0, i32 0, i32 0) + %2898 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2886, <4 x float> %2629, i32 0, i32 0, i32 0) + %2899 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2887, <4 x float> %2898, i32 0, i32 0, i32 0) + %2900 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2886, <4 x float> %2635, i32 0, i32 0, i32 0) + %2901 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2887, <4 x float> %2900, i32 0, i32 0, i32 0) + %2902 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2888, <4 x float> %2641, i32 0, i32 0, i32 0) + %2903 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2889, <4 x float> %2902, i32 0, i32 0, i32 0) + %2904 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2888, <4 x float> %2647, i32 0, i32 0, i32 0) + %2905 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2889, <4 x float> %2904, i32 0, i32 0, i32 0) + %2906 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2862, <4 x float> %2653, i32 0, i32 0, i32 0) + %2907 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2863, <4 x float> %2906, i32 0, i32 0, i32 0) + %2908 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2862, <4 x float> %2659, i32 0, i32 0, i32 0) + %2909 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2863, <4 x float> %2908, i32 0, i32 0, i32 0) + %2910 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2864, <4 x float> %2665, i32 0, i32 0, i32 0) + %2911 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2865, <4 x float> %2910, i32 0, i32 0, i32 0) + %2912 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2864, <4 x float> %2671, i32 0, i32 0, i32 0) + %2913 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2865, <4 x float> %2912, i32 0, i32 0, i32 0) + %2914 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2862, <4 x float> %2677, i32 0, i32 0, i32 0) + %2915 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2863, <4 x float> %2914, i32 0, i32 0, i32 0) + %2916 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2862, <4 x float> %2683, i32 0, i32 0, i32 0) + %2917 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2863, <4 x float> %2916, i32 0, i32 0, i32 0) + %2918 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2864, <4 x float> %2689, i32 0, i32 0, i32 0) + %2919 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2865, <4 x float> %2918, i32 0, i32 0, i32 0) + %2920 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2864, <4 x float> %2695, i32 0, i32 0, i32 0) + %2921 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2865, <4 x float> %2920, i32 0, i32 0, i32 0) + %2922 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2886, <4 x float> %2701, i32 0, i32 0, i32 0) + %2923 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2887, <4 x float> %2922, i32 0, i32 0, i32 0) + %2924 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2886, <4 x float> %2707, i32 0, i32 0, i32 0) + %2925 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2887, <4 x float> %2924, i32 0, i32 0, i32 0) + %2926 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2888, <4 x float> %2713, i32 0, i32 0, i32 0) + %2927 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2889, <4 x float> %2926, i32 0, i32 0, i32 0) + %2928 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2888, <4 x float> %2719, i32 0, i32 0, i32 0) + %2929 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2889, <4 x float> %2928, i32 0, i32 0, i32 0) + %2930 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2886, <4 x float> %2725, i32 0, i32 0, i32 0) + %2931 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2887, <4 x float> %2930, i32 0, i32 0, i32 0) + %2932 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2886, <4 x float> %2731, i32 0, i32 0, i32 0) + %2933 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2887, <4 x float> %2932, i32 0, i32 0, i32 0) + %2934 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2888, <4 x float> %2737, i32 0, i32 0, i32 0) + %2935 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2889, <4 x float> %2934, i32 0, i32 0, i32 0) + %2936 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2888, <4 x float> %2743, i32 0, i32 0, i32 0) + %2937 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2889, <4 x float> %2936, i32 0, i32 0, i32 0) + fence syncscope("workgroup") release + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + %2938 = shufflevector <2 x half> %2199, <2 x half> %2263, <8 x i32> + %2939 = shufflevector <2 x half> %2264, <2 x half> poison, <8 x i32> + %2940 = shufflevector <8 x half> %2938, <8 x half> %2939, <8 x i32> + %2941 = shufflevector <2 x half> %2200, <2 x half> poison, <8 x i32> + %2942 = shufflevector <8 x half> %2940, <8 x half> %2941, <8 x 
i32> + store <8 x half> %2942, ptr addrspace(3) %199, align 16 + %2943 = shufflevector <2 x half> %2201, <2 x half> %2265, <8 x i32> + %2944 = shufflevector <2 x half> %2266, <2 x half> poison, <8 x i32> + %2945 = shufflevector <8 x half> %2943, <8 x half> %2944, <8 x i32> + %2946 = shufflevector <2 x half> %2202, <2 x half> poison, <8 x i32> + %2947 = shufflevector <8 x half> %2945, <8 x half> %2946, <8 x i32> + store <8 x half> %2947, ptr addrspace(3) %201, align 16 + %2948 = shufflevector <2 x half> %2203, <2 x half> %2267, <8 x i32> + %2949 = shufflevector <2 x half> %2268, <2 x half> poison, <8 x i32> + %2950 = shufflevector <8 x half> %2948, <8 x half> %2949, <8 x i32> + %2951 = shufflevector <2 x half> %2204, <2 x half> poison, <8 x i32> + %2952 = shufflevector <8 x half> %2950, <8 x half> %2951, <8 x i32> + store <8 x half> %2952, ptr addrspace(3) %203, align 16 + %2953 = shufflevector <2 x half> %2205, <2 x half> %2269, <8 x i32> + %2954 = shufflevector <2 x half> %2270, <2 x half> poison, <8 x i32> + %2955 = shufflevector <8 x half> %2953, <8 x half> %2954, <8 x i32> + %2956 = shufflevector <2 x half> %2206, <2 x half> poison, <8 x i32> + %2957 = shufflevector <8 x half> %2955, <8 x half> %2956, <8 x i32> + store <8 x half> %2957, ptr addrspace(3) %205, align 16 + %2958 = shufflevector <2 x half> %2207, <2 x half> %2271, <8 x i32> + %2959 = shufflevector <2 x half> %2272, <2 x half> poison, <8 x i32> + %2960 = shufflevector <8 x half> %2958, <8 x half> %2959, <8 x i32> + %2961 = shufflevector <2 x half> %2208, <2 x half> poison, <8 x i32> + %2962 = shufflevector <8 x half> %2960, <8 x half> %2961, <8 x i32> + store <8 x half> %2962, ptr addrspace(3) %207, align 16 + %2963 = shufflevector <2 x half> %2209, <2 x half> %2273, <8 x i32> + %2964 = shufflevector <2 x half> %2274, <2 x half> poison, <8 x i32> + %2965 = shufflevector <8 x half> %2963, <8 x half> %2964, <8 x i32> + %2966 = shufflevector <2 x half> %2210, <2 x half> poison, <8 x i32> + %2967 = 
shufflevector <8 x half> %2965, <8 x half> %2966, <8 x i32> + store <8 x half> %2967, ptr addrspace(3) %209, align 16 + %2968 = shufflevector <2 x half> %2211, <2 x half> %2275, <8 x i32> + %2969 = shufflevector <2 x half> %2276, <2 x half> poison, <8 x i32> + %2970 = shufflevector <8 x half> %2968, <8 x half> %2969, <8 x i32> + %2971 = shufflevector <2 x half> %2212, <2 x half> poison, <8 x i32> + %2972 = shufflevector <8 x half> %2970, <8 x half> %2971, <8 x i32> + store <8 x half> %2972, ptr addrspace(3) %211, align 16 + %2973 = shufflevector <2 x half> %2213, <2 x half> %2277, <8 x i32> + %2974 = shufflevector <2 x half> %2278, <2 x half> poison, <8 x i32> + %2975 = shufflevector <8 x half> %2973, <8 x half> %2974, <8 x i32> + %2976 = shufflevector <2 x half> %2214, <2 x half> poison, <8 x i32> + %2977 = shufflevector <8 x half> %2975, <8 x half> %2976, <8 x i32> + store <8 x half> %2977, ptr addrspace(3) %213, align 16 + %2978 = shufflevector <2 x half> %2215, <2 x half> %2279, <8 x i32> + %2979 = shufflevector <2 x half> %2280, <2 x half> poison, <8 x i32> + %2980 = shufflevector <8 x half> %2978, <8 x half> %2979, <8 x i32> + %2981 = shufflevector <2 x half> %2216, <2 x half> poison, <8 x i32> + %2982 = shufflevector <8 x half> %2980, <8 x half> %2981, <8 x i32> + store <8 x half> %2982, ptr addrspace(3) %214, align 16 + %2983 = shufflevector <2 x half> %2217, <2 x half> %2281, <8 x i32> + %2984 = shufflevector <2 x half> %2282, <2 x half> poison, <8 x i32> + %2985 = shufflevector <8 x half> %2983, <8 x half> %2984, <8 x i32> + %2986 = shufflevector <2 x half> %2218, <2 x half> poison, <8 x i32> + %2987 = shufflevector <8 x half> %2985, <8 x half> %2986, <8 x i32> + store <8 x half> %2987, ptr addrspace(3) %215, align 16 + %2988 = shufflevector <2 x half> %2219, <2 x half> %2283, <8 x i32> + %2989 = shufflevector <2 x half> %2284, <2 x half> poison, <8 x i32> + %2990 = shufflevector <8 x half> %2988, <8 x half> %2989, <8 x i32> + %2991 = shufflevector <2 x 
half> %2220, <2 x half> poison, <8 x i32> + %2992 = shufflevector <8 x half> %2990, <8 x half> %2991, <8 x i32> + store <8 x half> %2992, ptr addrspace(3) %216, align 16 + %2993 = shufflevector <2 x half> %2221, <2 x half> %2285, <8 x i32> + %2994 = shufflevector <2 x half> %2286, <2 x half> poison, <8 x i32> + %2995 = shufflevector <8 x half> %2993, <8 x half> %2994, <8 x i32> + %2996 = shufflevector <2 x half> %2222, <2 x half> poison, <8 x i32> + %2997 = shufflevector <8 x half> %2995, <8 x half> %2996, <8 x i32> + store <8 x half> %2997, ptr addrspace(3) %217, align 16 + %2998 = shufflevector <2 x half> %2223, <2 x half> %2287, <8 x i32> + %2999 = shufflevector <2 x half> %2288, <2 x half> poison, <8 x i32> + %3000 = shufflevector <8 x half> %2998, <8 x half> %2999, <8 x i32> + %3001 = shufflevector <2 x half> %2224, <2 x half> poison, <8 x i32> + %3002 = shufflevector <8 x half> %3000, <8 x half> %3001, <8 x i32> + store <8 x half> %3002, ptr addrspace(3) %218, align 16 + %3003 = shufflevector <2 x half> %2225, <2 x half> %2289, <8 x i32> + %3004 = shufflevector <2 x half> %2290, <2 x half> poison, <8 x i32> + %3005 = shufflevector <8 x half> %3003, <8 x half> %3004, <8 x i32> + %3006 = shufflevector <2 x half> %2226, <2 x half> poison, <8 x i32> + %3007 = shufflevector <8 x half> %3005, <8 x half> %3006, <8 x i32> + store <8 x half> %3007, ptr addrspace(3) %219, align 16 + %3008 = shufflevector <2 x half> %2227, <2 x half> %2291, <8 x i32> + %3009 = shufflevector <2 x half> %2292, <2 x half> poison, <8 x i32> + %3010 = shufflevector <8 x half> %3008, <8 x half> %3009, <8 x i32> + %3011 = shufflevector <2 x half> %2228, <2 x half> poison, <8 x i32> + %3012 = shufflevector <8 x half> %3010, <8 x half> %3011, <8 x i32> + store <8 x half> %3012, ptr addrspace(3) %220, align 16 + %3013 = shufflevector <2 x half> %2229, <2 x half> %2293, <8 x i32> + %3014 = shufflevector <2 x half> %2294, <2 x half> poison, <8 x i32> + %3015 = shufflevector <8 x half> %3013, <8 x 
half> %3014, <8 x i32> + %3016 = shufflevector <2 x half> %2230, <2 x half> poison, <8 x i32> + %3017 = shufflevector <8 x half> %3015, <8 x half> %3016, <8 x i32> + store <8 x half> %3017, ptr addrspace(3) %221, align 16 + fence syncscope("workgroup") release + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + %3018 = load <8 x half>, ptr addrspace(3) %243, align 16 + %3019 = load <8 x half>, ptr addrspace(3) %245, align 16 + %3020 = load <8 x half>, ptr addrspace(3) %233, align 16 + %3021 = load <8 x half>, ptr addrspace(3) %235, align 16 + %3022 = shufflevector <8 x half> %3020, <8 x half> poison, <4 x i32> + %3023 = shufflevector <8 x half> %3020, <8 x half> poison, <4 x i32> + %3024 = shufflevector <8 x half> %3021, <8 x half> poison, <4 x i32> + %3025 = shufflevector <8 x half> %3021, <8 x half> poison, <4 x i32> + %3026 = shufflevector <8 x half> %3018, <8 x half> poison, <4 x i32> + %3027 = shufflevector <8 x half> %3018, <8 x half> poison, <4 x i32> + %3028 = shufflevector <8 x half> %3019, <8 x half> poison, <4 x i32> + %3029 = shufflevector <8 x half> %3019, <8 x half> poison, <4 x i32> + %3030 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3022, <4 x float> %2762, i32 0, i32 0, i32 0) + %3031 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3023, <4 x float> %3030, i32 0, i32 0, i32 0) + %3032 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3022, <4 x float> %2764, i32 0, i32 0, i32 0) + %3033 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3023, <4 x float> %3032, i32 0, i32 0, i32 0) + %3034 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3024, <4 x float> %2766, i32 0, i32 0, i32 0) + %3035 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3025, <4 x float> %3034, i32 0, i32 0, i32 0) + 
%3036 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3024, <4 x float> %2768, i32 0, i32 0, i32 0) + %3037 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3025, <4 x float> %3036, i32 0, i32 0, i32 0) + %3038 = load <8 x half>, ptr addrspace(3) %258, align 16 + %3039 = load <8 x half>, ptr addrspace(3) %260, align 16 + %3040 = shufflevector <8 x half> %3038, <8 x half> poison, <4 x i32> + %3041 = shufflevector <8 x half> %3038, <8 x half> poison, <4 x i32> + %3042 = shufflevector <8 x half> %3039, <8 x half> poison, <4 x i32> + %3043 = shufflevector <8 x half> %3039, <8 x half> poison, <4 x i32> + %3044 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3022, <4 x float> %2779, i32 0, i32 0, i32 0) + %3045 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3023, <4 x float> %3044, i32 0, i32 0, i32 0) + %3046 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3022, <4 x float> %2781, i32 0, i32 0, i32 0) + %3047 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3023, <4 x float> %3046, i32 0, i32 0, i32 0) + %3048 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3024, <4 x float> %2783, i32 0, i32 0, i32 0) + %3049 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3025, <4 x float> %3048, i32 0, i32 0, i32 0) + %3050 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3024, <4 x float> %2785, i32 0, i32 0, i32 0) + %3051 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3025, <4 x float> %3050, i32 0, i32 0, i32 0) + %3052 = load <8 x half>, ptr addrspace(3) %2423, align 16 + %3053 = load <8 x half>, ptr addrspace(3) %2425, align 16 + %3054 = shufflevector <8 x half> %3052, <8 x half> 
poison, <4 x i32> + %3055 = shufflevector <8 x half> %3052, <8 x half> poison, <4 x i32> + %3056 = shufflevector <8 x half> %3053, <8 x half> poison, <4 x i32> + %3057 = shufflevector <8 x half> %3053, <8 x half> poison, <4 x i32> + %3058 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3022, <4 x float> %2819, i32 0, i32 0, i32 0) + %3059 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3023, <4 x float> %3058, i32 0, i32 0, i32 0) + %3060 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3022, <4 x float> %2821, i32 0, i32 0, i32 0) + %3061 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3023, <4 x float> %3060, i32 0, i32 0, i32 0) + %3062 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3024, <4 x float> %2823, i32 0, i32 0, i32 0) + %3063 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3025, <4 x float> %3062, i32 0, i32 0, i32 0) + %3064 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3024, <4 x float> %2825, i32 0, i32 0, i32 0) + %3065 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3025, <4 x float> %3064, i32 0, i32 0, i32 0) + %3066 = load <8 x half>, ptr addrspace(3) %2456, align 16 + %3067 = load <8 x half>, ptr addrspace(3) %2458, align 16 + %3068 = shufflevector <8 x half> %3066, <8 x half> poison, <4 x i32> + %3069 = shufflevector <8 x half> %3066, <8 x half> poison, <4 x i32> + %3070 = shufflevector <8 x half> %3067, <8 x half> poison, <4 x i32> + %3071 = shufflevector <8 x half> %3067, <8 x half> poison, <4 x i32> + %3072 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3022, <4 x float> %2835, i32 0, i32 0, i32 0) + %3073 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 
x half> %3023, <4 x float> %3072, i32 0, i32 0, i32 0) + %3074 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3022, <4 x float> %2837, i32 0, i32 0, i32 0) + %3075 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3023, <4 x float> %3074, i32 0, i32 0, i32 0) + %3076 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3024, <4 x float> %2839, i32 0, i32 0, i32 0) + %3077 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3025, <4 x float> %3076, i32 0, i32 0, i32 0) + %3078 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3024, <4 x float> %2841, i32 0, i32 0, i32 0) + %3079 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3025, <4 x float> %3078, i32 0, i32 0, i32 0) + %3080 = load <8 x half>, ptr addrspace(3) %251, align 16 + %3081 = load <8 x half>, ptr addrspace(3) %253, align 16 + %3082 = shufflevector <8 x half> %3080, <8 x half> poison, <4 x i32> + %3083 = shufflevector <8 x half> %3080, <8 x half> poison, <4 x i32> + %3084 = shufflevector <8 x half> %3081, <8 x half> poison, <4 x i32> + %3085 = shufflevector <8 x half> %3081, <8 x half> poison, <4 x i32> + %3086 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3082, <4 x float> %2795, i32 0, i32 0, i32 0) + %3087 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3083, <4 x float> %3086, i32 0, i32 0, i32 0) + %3088 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3082, <4 x float> %2797, i32 0, i32 0, i32 0) + %3089 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3083, <4 x float> %3088, i32 0, i32 0, i32 0) + %3090 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3084, <4 x float> %2799, 
i32 0, i32 0, i32 0) + %3091 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3085, <4 x float> %3090, i32 0, i32 0, i32 0) + %3092 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3084, <4 x float> %2801, i32 0, i32 0, i32 0) + %3093 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3085, <4 x float> %3092, i32 0, i32 0, i32 0) + %3094 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3082, <4 x float> %2803, i32 0, i32 0, i32 0) + %3095 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3083, <4 x float> %3094, i32 0, i32 0, i32 0) + %3096 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3082, <4 x float> %2805, i32 0, i32 0, i32 0) + %3097 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3083, <4 x float> %3096, i32 0, i32 0, i32 0) + %3098 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3084, <4 x float> %2807, i32 0, i32 0, i32 0) + %3099 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3085, <4 x float> %3098, i32 0, i32 0, i32 0) + %3100 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3084, <4 x float> %2809, i32 0, i32 0, i32 0) + %3101 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3085, <4 x float> %3100, i32 0, i32 0, i32 0) + %3102 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3082, <4 x float> %2843, i32 0, i32 0, i32 0) + %3103 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3083, <4 x float> %3102, i32 0, i32 0, i32 0) + %3104 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3082, <4 x float> %2845, i32 0, i32 
0, i32 0) + %3105 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3083, <4 x float> %3104, i32 0, i32 0, i32 0) + %3106 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3084, <4 x float> %2847, i32 0, i32 0, i32 0) + %3107 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3085, <4 x float> %3106, i32 0, i32 0, i32 0) + %3108 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3084, <4 x float> %2849, i32 0, i32 0, i32 0) + %3109 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3085, <4 x float> %3108, i32 0, i32 0, i32 0) + %3110 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3082, <4 x float> %2851, i32 0, i32 0, i32 0) + %3111 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3083, <4 x float> %3110, i32 0, i32 0, i32 0) + %3112 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3082, <4 x float> %2853, i32 0, i32 0, i32 0) + %3113 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3083, <4 x float> %3112, i32 0, i32 0, i32 0) + %3114 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3084, <4 x float> %2855, i32 0, i32 0, i32 0) + %3115 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3085, <4 x float> %3114, i32 0, i32 0, i32 0) + %3116 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3084, <4 x float> %2857, i32 0, i32 0, i32 0) + %3117 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3085, <4 x float> %3116, i32 0, i32 0, i32 0) + %3118 = load <8 x half>, ptr addrspace(3) %2536, align 16 + %3119 = load <8 x half>, ptr addrspace(3) %2538, align 16 + %3120 = shufflevector 
<8 x half> %3118, <8 x half> poison, <4 x i32> + %3121 = shufflevector <8 x half> %3118, <8 x half> poison, <4 x i32> + %3122 = shufflevector <8 x half> %3119, <8 x half> poison, <4 x i32> + %3123 = shufflevector <8 x half> %3119, <8 x half> poison, <4 x i32> + %3124 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3120, <4 x float> %2867, i32 0, i32 0, i32 0) + %3125 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3121, <4 x float> %3124, i32 0, i32 0, i32 0) + %3126 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3120, <4 x float> %2869, i32 0, i32 0, i32 0) + %3127 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3121, <4 x float> %3126, i32 0, i32 0, i32 0) + %3128 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3122, <4 x float> %2871, i32 0, i32 0, i32 0) + %3129 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3123, <4 x float> %3128, i32 0, i32 0, i32 0) + %3130 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3122, <4 x float> %2873, i32 0, i32 0, i32 0) + %3131 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3123, <4 x float> %3130, i32 0, i32 0, i32 0) + %3132 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3120, <4 x float> %2875, i32 0, i32 0, i32 0) + %3133 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3121, <4 x float> %3132, i32 0, i32 0, i32 0) + %3134 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3120, <4 x float> %2877, i32 0, i32 0, i32 0) + %3135 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3121, <4 x float> %3134, i32 0, i32 0, i32 0) + %3136 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3122, <4 x float> %2879, i32 0, i32 0, i32 0) + %3137 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3123, <4 x float> %3136, i32 0, i32 0, i32 0) + %3138 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3122, <4 x float> %2881, i32 0, i32 0, i32 0) + %3139 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3123, <4 x float> %3138, i32 0, i32 0, i32 0) + %3140 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3120, <4 x float> %2907, i32 0, i32 0, i32 0) + %3141 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3121, <4 x float> %3140, i32 0, i32 0, i32 0) + %3142 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3120, <4 x float> %2909, i32 0, i32 0, i32 0) + %3143 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3121, <4 x float> %3142, i32 0, i32 0, i32 0) + %3144 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3122, <4 x float> %2911, i32 0, i32 0, i32 0) + %3145 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3123, <4 x float> %3144, i32 0, i32 0, i32 0) + %3146 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3122, <4 x float> %2913, i32 0, i32 0, i32 0) + %3147 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3123, <4 x float> %3146, i32 0, i32 0, i32 0) + %3148 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3120, <4 x float> %2915, i32 0, i32 0, i32 0) + %3149 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3121, <4 x float> %3148, i32 0, i32 0, i32 0) + %3150 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3120, <4 x float> %2917, i32 0, i32 0, i32 0) + %3151 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3121, <4 x float> %3150, i32 0, i32 0, i32 0) + %3152 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3122, <4 x float> %2919, i32 0, i32 0, i32 0) + %3153 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3123, <4 x float> %3152, i32 0, i32 0, i32 0) + %3154 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3122, <4 x float> %2921, i32 0, i32 0, i32 0) + %3155 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3123, <4 x float> %3154, i32 0, i32 0, i32 0) + %3156 = load <8 x half>, ptr addrspace(3) %2592, align 16 + %3157 = load <8 x half>, ptr addrspace(3) %2594, align 16 + %3158 = shufflevector <8 x half> %3156, <8 x half> poison, <4 x i32> + %3159 = shufflevector <8 x half> %3156, <8 x half> poison, <4 x i32> + %3160 = shufflevector <8 x half> %3157, <8 x half> poison, <4 x i32> + %3161 = shufflevector <8 x half> %3157, <8 x half> poison, <4 x i32> + %3162 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3158, <4 x float> %2891, i32 0, i32 0, i32 0) + %3163 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3159, <4 x float> %3162, i32 0, i32 0, i32 0) + %3164 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3158, <4 x float> %2893, i32 0, i32 0, i32 0) + %3165 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3159, <4 x float> %3164, i32 0, i32 0, i32 0) + %3166 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3160, <4 x float> %2895, i32 0, i32 0, i32 0) + %3167 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3161, <4 x float> %3166, i32 0, i32 0, i32 0) + %3168 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3160, <4 x float> %2897, i32 0, i32 0, i32 0) + %3169 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3161, <4 x float> %3168, i32 0, i32 0, i32 0) + %3170 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3158, <4 x float> %2899, i32 0, i32 0, i32 0) + %3171 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3159, <4 x float> %3170, i32 0, i32 0, i32 0) + %3172 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3158, <4 x float> %2901, i32 0, i32 0, i32 0) + %3173 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3159, <4 x float> %3172, i32 0, i32 0, i32 0) + %3174 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3160, <4 x float> %2903, i32 0, i32 0, i32 0) + %3175 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3161, <4 x float> %3174, i32 0, i32 0, i32 0) + %3176 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3160, <4 x float> %2905, i32 0, i32 0, i32 0) + %3177 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3161, <4 x float> %3176, i32 0, i32 0, i32 0) + %3178 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3158, <4 x float> %2923, i32 0, i32 0, i32 0) + %3179 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3159, <4 x float> %3178, i32 0, i32 0, i32 0) + %3180 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3158, <4 x float> %2925, i32 0, i32 0, i32 0) + %3181 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3159, <4 x float> %3180, i32 0, i32 0, i32 0) + %3182 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3160, <4 x float> %2927, i32 0, i32 0, i32 0) + %3183 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3161, <4 x float> %3182, i32 0, i32 0, i32 0) + %3184 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3160, <4 x float> %2929, i32 0, i32 0, i32 0) + %3185 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3161, <4 x float> %3184, i32 0, i32 0, i32 0) + %3186 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3158, <4 x float> %2931, i32 0, i32 0, i32 0) + %3187 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3159, <4 x float> %3186, i32 0, i32 0, i32 0) + %3188 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3158, <4 x float> %2933, i32 0, i32 0, i32 0) + %3189 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3159, <4 x float> %3188, i32 0, i32 0, i32 0) + %3190 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3160, <4 x float> %2935, i32 0, i32 0, i32 0) + %3191 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3161, <4 x float> %3190, i32 0, i32 0, i32 0) + %3192 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3160, <4 x float> %2937, i32 0, i32 0, i32 0) + %3193 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3161, <4 x float> %3192, i32 0, i32 0, i32 0) + %3194 = load <8 x half>, ptr addrspace(3) %2745, align 16 + %3195 = load <8 x half>, ptr addrspace(3) %2747, align 16 + %3196 = load <8 x half>, ptr addrspace(3) %2749, align 16 + 
%3197 = load <8 x half>, ptr addrspace(3) %2751, align 16 + %3198 = shufflevector <8 x half> %3196, <8 x half> poison, <4 x i32> + %3199 = shufflevector <8 x half> %3196, <8 x half> poison, <4 x i32> + %3200 = shufflevector <8 x half> %3197, <8 x half> poison, <4 x i32> + %3201 = shufflevector <8 x half> %3197, <8 x half> poison, <4 x i32> + %3202 = shufflevector <8 x half> %3194, <8 x half> poison, <4 x i32> + %3203 = shufflevector <8 x half> %3194, <8 x half> poison, <4 x i32> + %3204 = shufflevector <8 x half> %3195, <8 x half> poison, <4 x i32> + %3205 = shufflevector <8 x half> %3195, <8 x half> poison, <4 x i32> + %3206 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3198, <4 x float> %3031, i32 0, i32 0, i32 0) + %3207 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3199, <4 x float> %3206, i32 0, i32 0, i32 0) + %3208 = extractelement <4 x float> %3207, i64 0 + %3209 = extractelement <4 x float> %3207, i64 1 + %3210 = extractelement <4 x float> %3207, i64 2 + %3211 = extractelement <4 x float> %3207, i64 3 + %3212 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3198, <4 x float> %3033, i32 0, i32 0, i32 0) + %3213 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3199, <4 x float> %3212, i32 0, i32 0, i32 0) + %3214 = extractelement <4 x float> %3213, i64 0 + %3215 = extractelement <4 x float> %3213, i64 1 + %3216 = extractelement <4 x float> %3213, i64 2 + %3217 = extractelement <4 x float> %3213, i64 3 + %3218 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3200, <4 x float> %3035, i32 0, i32 0, i32 0) + %3219 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3201, <4 x float> %3218, i32 0, i32 0, i32 0) + %3220 = extractelement <4 x float> %3219, i64 0 + %3221 = extractelement <4 x float> %3219, i64 1 + %3222 = 
extractelement <4 x float> %3219, i64 2 + %3223 = extractelement <4 x float> %3219, i64 3 + %3224 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3200, <4 x float> %3037, i32 0, i32 0, i32 0) + %3225 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3201, <4 x float> %3224, i32 0, i32 0, i32 0) + %3226 = extractelement <4 x float> %3225, i64 0 + %3227 = extractelement <4 x float> %3225, i64 1 + %3228 = extractelement <4 x float> %3225, i64 2 + %3229 = extractelement <4 x float> %3225, i64 3 + %3230 = load <8 x half>, ptr addrspace(3) %2770, align 16 + %3231 = load <8 x half>, ptr addrspace(3) %2772, align 16 + %3232 = shufflevector <8 x half> %3230, <8 x half> poison, <4 x i32> + %3233 = shufflevector <8 x half> %3230, <8 x half> poison, <4 x i32> + %3234 = shufflevector <8 x half> %3231, <8 x half> poison, <4 x i32> + %3235 = shufflevector <8 x half> %3231, <8 x half> poison, <4 x i32> + %3236 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3198, <4 x float> %3045, i32 0, i32 0, i32 0) + %3237 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3199, <4 x float> %3236, i32 0, i32 0, i32 0) + %3238 = extractelement <4 x float> %3237, i64 0 + %3239 = extractelement <4 x float> %3237, i64 1 + %3240 = extractelement <4 x float> %3237, i64 2 + %3241 = extractelement <4 x float> %3237, i64 3 + %3242 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3198, <4 x float> %3047, i32 0, i32 0, i32 0) + %3243 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3199, <4 x float> %3242, i32 0, i32 0, i32 0) + %3244 = extractelement <4 x float> %3243, i64 0 + %3245 = extractelement <4 x float> %3243, i64 1 + %3246 = extractelement <4 x float> %3243, i64 2 + %3247 = extractelement <4 x float> %3243, i64 3 + %3248 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3200, <4 x float> %3049, i32 0, i32 0, i32 0) + %3249 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3201, <4 x float> %3248, i32 0, i32 0, i32 0) + %3250 = extractelement <4 x float> %3249, i64 0 + %3251 = extractelement <4 x float> %3249, i64 1 + %3252 = extractelement <4 x float> %3249, i64 2 + %3253 = extractelement <4 x float> %3249, i64 3 + %3254 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3200, <4 x float> %3051, i32 0, i32 0, i32 0) + %3255 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3201, <4 x float> %3254, i32 0, i32 0, i32 0) + %3256 = extractelement <4 x float> %3255, i64 0 + %3257 = extractelement <4 x float> %3255, i64 1 + %3258 = extractelement <4 x float> %3255, i64 2 + %3259 = extractelement <4 x float> %3255, i64 3 + %3260 = load <8 x half>, ptr addrspace(3) %2786, align 16 + %3261 = load <8 x half>, ptr addrspace(3) %2788, align 16 + %3262 = shufflevector <8 x half> %3260, <8 x half> poison, <4 x i32> + %3263 = shufflevector <8 x half> %3260, <8 x half> poison, <4 x i32> + %3264 = shufflevector <8 x half> %3261, <8 x half> poison, <4 x i32> + %3265 = shufflevector <8 x half> %3261, <8 x half> poison, <4 x i32> + %3266 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3262, <4 x float> %3087, i32 0, i32 0, i32 0) + %3267 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3263, <4 x float> %3266, i32 0, i32 0, i32 0) + %3268 = extractelement <4 x float> %3267, i64 0 + %3269 = extractelement <4 x float> %3267, i64 1 + %3270 = extractelement <4 x float> %3267, i64 2 + %3271 = extractelement <4 x float> %3267, i64 3 + %3272 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3262, <4 x float> %3089, i32 0, i32 0, i32 0) + %3273 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3263, <4 x float> %3272, i32 0, i32 0, i32 0) + %3274 = extractelement <4 x float> %3273, i64 0 + %3275 = extractelement <4 x float> %3273, i64 1 + %3276 = extractelement <4 x float> %3273, i64 2 + %3277 = extractelement <4 x float> %3273, i64 3 + %3278 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3264, <4 x float> %3091, i32 0, i32 0, i32 0) + %3279 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3265, <4 x float> %3278, i32 0, i32 0, i32 0) + %3280 = extractelement <4 x float> %3279, i64 0 + %3281 = extractelement <4 x float> %3279, i64 1 + %3282 = extractelement <4 x float> %3279, i64 2 + %3283 = extractelement <4 x float> %3279, i64 3 + %3284 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3264, <4 x float> %3093, i32 0, i32 0, i32 0) + %3285 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3265, <4 x float> %3284, i32 0, i32 0, i32 0) + %3286 = extractelement <4 x float> %3285, i64 0 + %3287 = extractelement <4 x float> %3285, i64 1 + %3288 = extractelement <4 x float> %3285, i64 2 + %3289 = extractelement <4 x float> %3285, i64 3 + %3290 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3262, <4 x float> %3095, i32 0, i32 0, i32 0) + %3291 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3263, <4 x float> %3290, i32 0, i32 0, i32 0) + %3292 = extractelement <4 x float> %3291, i64 0 + %3293 = extractelement <4 x float> %3291, i64 1 + %3294 = extractelement <4 x float> %3291, i64 2 + %3295 = extractelement <4 x float> %3291, i64 3 + %3296 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3262, <4 x float> %3097, i32 0, i32 0, i32 0) + %3297 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> 
%3263, <4 x float> %3296, i32 0, i32 0, i32 0) + %3298 = extractelement <4 x float> %3297, i64 0 + %3299 = extractelement <4 x float> %3297, i64 1 + %3300 = extractelement <4 x float> %3297, i64 2 + %3301 = extractelement <4 x float> %3297, i64 3 + %3302 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3264, <4 x float> %3099, i32 0, i32 0, i32 0) + %3303 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3265, <4 x float> %3302, i32 0, i32 0, i32 0) + %3304 = extractelement <4 x float> %3303, i64 0 + %3305 = extractelement <4 x float> %3303, i64 1 + %3306 = extractelement <4 x float> %3303, i64 2 + %3307 = extractelement <4 x float> %3303, i64 3 + %3308 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3264, <4 x float> %3101, i32 0, i32 0, i32 0) + %3309 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3265, <4 x float> %3308, i32 0, i32 0, i32 0) + %3310 = extractelement <4 x float> %3309, i64 0 + %3311 = extractelement <4 x float> %3309, i64 1 + %3312 = extractelement <4 x float> %3309, i64 2 + %3313 = extractelement <4 x float> %3309, i64 3 + %3314 = load <8 x half>, ptr addrspace(3) %2810, align 16 + %3315 = load <8 x half>, ptr addrspace(3) %2812, align 16 + %3316 = shufflevector <8 x half> %3314, <8 x half> poison, <4 x i32> + %3317 = shufflevector <8 x half> %3314, <8 x half> poison, <4 x i32> + %3318 = shufflevector <8 x half> %3315, <8 x half> poison, <4 x i32> + %3319 = shufflevector <8 x half> %3315, <8 x half> poison, <4 x i32> + %3320 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3198, <4 x float> %3059, i32 0, i32 0, i32 0) + %3321 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3199, <4 x float> %3320, i32 0, i32 0, i32 0) + %3322 = extractelement <4 x float> %3321, i64 0 + %3323 = extractelement <4 x float> %3321, 
i64 1 + %3324 = extractelement <4 x float> %3321, i64 2 + %3325 = extractelement <4 x float> %3321, i64 3 + %3326 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3198, <4 x float> %3061, i32 0, i32 0, i32 0) + %3327 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3199, <4 x float> %3326, i32 0, i32 0, i32 0) + %3328 = extractelement <4 x float> %3327, i64 0 + %3329 = extractelement <4 x float> %3327, i64 1 + %3330 = extractelement <4 x float> %3327, i64 2 + %3331 = extractelement <4 x float> %3327, i64 3 + %3332 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3200, <4 x float> %3063, i32 0, i32 0, i32 0) + %3333 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3201, <4 x float> %3332, i32 0, i32 0, i32 0) + %3334 = extractelement <4 x float> %3333, i64 0 + %3335 = extractelement <4 x float> %3333, i64 1 + %3336 = extractelement <4 x float> %3333, i64 2 + %3337 = extractelement <4 x float> %3333, i64 3 + %3338 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3200, <4 x float> %3065, i32 0, i32 0, i32 0) + %3339 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3201, <4 x float> %3338, i32 0, i32 0, i32 0) + %3340 = extractelement <4 x float> %3339, i64 0 + %3341 = extractelement <4 x float> %3339, i64 1 + %3342 = extractelement <4 x float> %3339, i64 2 + %3343 = extractelement <4 x float> %3339, i64 3 + %3344 = load <8 x half>, ptr addrspace(3) %2826, align 16 + %3345 = load <8 x half>, ptr addrspace(3) %2828, align 16 + %3346 = shufflevector <8 x half> %3344, <8 x half> poison, <4 x i32> + %3347 = shufflevector <8 x half> %3344, <8 x half> poison, <4 x i32> + %3348 = shufflevector <8 x half> %3345, <8 x half> poison, <4 x i32> + %3349 = shufflevector <8 x half> %3345, <8 x half> poison, <4 x i32> + %3350 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3198, <4 x float> %3073, i32 0, i32 0, i32 0) + %3351 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3199, <4 x float> %3350, i32 0, i32 0, i32 0) + %3352 = extractelement <4 x float> %3351, i64 0 + %3353 = extractelement <4 x float> %3351, i64 1 + %3354 = extractelement <4 x float> %3351, i64 2 + %3355 = extractelement <4 x float> %3351, i64 3 + %3356 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3198, <4 x float> %3075, i32 0, i32 0, i32 0) + %3357 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3199, <4 x float> %3356, i32 0, i32 0, i32 0) + %3358 = extractelement <4 x float> %3357, i64 0 + %3359 = extractelement <4 x float> %3357, i64 1 + %3360 = extractelement <4 x float> %3357, i64 2 + %3361 = extractelement <4 x float> %3357, i64 3 + %3362 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3200, <4 x float> %3077, i32 0, i32 0, i32 0) + %3363 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3201, <4 x float> %3362, i32 0, i32 0, i32 0) + %3364 = extractelement <4 x float> %3363, i64 0 + %3365 = extractelement <4 x float> %3363, i64 1 + %3366 = extractelement <4 x float> %3363, i64 2 + %3367 = extractelement <4 x float> %3363, i64 3 + %3368 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3200, <4 x float> %3079, i32 0, i32 0, i32 0) + %3369 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3201, <4 x float> %3368, i32 0, i32 0, i32 0) + %3370 = extractelement <4 x float> %3369, i64 0 + %3371 = extractelement <4 x float> %3369, i64 1 + %3372 = extractelement <4 x float> %3369, i64 2 + %3373 = extractelement <4 x float> %3369, i64 3 + %3374 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> 
%3262, <4 x float> %3103, i32 0, i32 0, i32 0) + %3375 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3263, <4 x float> %3374, i32 0, i32 0, i32 0) + %3376 = extractelement <4 x float> %3375, i64 0 + %3377 = extractelement <4 x float> %3375, i64 1 + %3378 = extractelement <4 x float> %3375, i64 2 + %3379 = extractelement <4 x float> %3375, i64 3 + %3380 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3262, <4 x float> %3105, i32 0, i32 0, i32 0) + %3381 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3263, <4 x float> %3380, i32 0, i32 0, i32 0) + %3382 = extractelement <4 x float> %3381, i64 0 + %3383 = extractelement <4 x float> %3381, i64 1 + %3384 = extractelement <4 x float> %3381, i64 2 + %3385 = extractelement <4 x float> %3381, i64 3 + %3386 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3264, <4 x float> %3107, i32 0, i32 0, i32 0) + %3387 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3265, <4 x float> %3386, i32 0, i32 0, i32 0) + %3388 = extractelement <4 x float> %3387, i64 0 + %3389 = extractelement <4 x float> %3387, i64 1 + %3390 = extractelement <4 x float> %3387, i64 2 + %3391 = extractelement <4 x float> %3387, i64 3 + %3392 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3264, <4 x float> %3109, i32 0, i32 0, i32 0) + %3393 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3265, <4 x float> %3392, i32 0, i32 0, i32 0) + %3394 = extractelement <4 x float> %3393, i64 0 + %3395 = extractelement <4 x float> %3393, i64 1 + %3396 = extractelement <4 x float> %3393, i64 2 + %3397 = extractelement <4 x float> %3393, i64 3 + %3398 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3262, <4 x float> %3111, i32 0, i32 0, i32 0) + %3399 = tail 
call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3263, <4 x float> %3398, i32 0, i32 0, i32 0) + %3400 = extractelement <4 x float> %3399, i64 0 + %3401 = extractelement <4 x float> %3399, i64 1 + %3402 = extractelement <4 x float> %3399, i64 2 + %3403 = extractelement <4 x float> %3399, i64 3 + %3404 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3262, <4 x float> %3113, i32 0, i32 0, i32 0) + %3405 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3263, <4 x float> %3404, i32 0, i32 0, i32 0) + %3406 = extractelement <4 x float> %3405, i64 0 + %3407 = extractelement <4 x float> %3405, i64 1 + %3408 = extractelement <4 x float> %3405, i64 2 + %3409 = extractelement <4 x float> %3405, i64 3 + %3410 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3264, <4 x float> %3115, i32 0, i32 0, i32 0) + %3411 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3265, <4 x float> %3410, i32 0, i32 0, i32 0) + %3412 = extractelement <4 x float> %3411, i64 0 + %3413 = extractelement <4 x float> %3411, i64 1 + %3414 = extractelement <4 x float> %3411, i64 2 + %3415 = extractelement <4 x float> %3411, i64 3 + %3416 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3264, <4 x float> %3117, i32 0, i32 0, i32 0) + %3417 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3265, <4 x float> %3416, i32 0, i32 0, i32 0) + %3418 = extractelement <4 x float> %3417, i64 0 + %3419 = extractelement <4 x float> %3417, i64 1 + %3420 = extractelement <4 x float> %3417, i64 2 + %3421 = extractelement <4 x float> %3417, i64 3 + %3422 = load <8 x half>, ptr addrspace(3) %2858, align 16 + %3423 = load <8 x half>, ptr addrspace(3) %2860, align 16 + %3424 = shufflevector <8 x half> %3422, <8 x half> poison, <4 x i32> + %3425 = shufflevector <8 x 
half> %3422, <8 x half> poison, <4 x i32> + %3426 = shufflevector <8 x half> %3423, <8 x half> poison, <4 x i32> + %3427 = shufflevector <8 x half> %3423, <8 x half> poison, <4 x i32> + %3428 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3424, <4 x float> %3125, i32 0, i32 0, i32 0) + %3429 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3425, <4 x float> %3428, i32 0, i32 0, i32 0) + %3430 = extractelement <4 x float> %3429, i64 0 + %3431 = extractelement <4 x float> %3429, i64 1 + %3432 = extractelement <4 x float> %3429, i64 2 + %3433 = extractelement <4 x float> %3429, i64 3 + %3434 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3424, <4 x float> %3127, i32 0, i32 0, i32 0) + %3435 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3425, <4 x float> %3434, i32 0, i32 0, i32 0) + %3436 = extractelement <4 x float> %3435, i64 0 + %3437 = extractelement <4 x float> %3435, i64 1 + %3438 = extractelement <4 x float> %3435, i64 2 + %3439 = extractelement <4 x float> %3435, i64 3 + %3440 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3426, <4 x float> %3129, i32 0, i32 0, i32 0) + %3441 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3427, <4 x float> %3440, i32 0, i32 0, i32 0) + %3442 = extractelement <4 x float> %3441, i64 0 + %3443 = extractelement <4 x float> %3441, i64 1 + %3444 = extractelement <4 x float> %3441, i64 2 + %3445 = extractelement <4 x float> %3441, i64 3 + %3446 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3426, <4 x float> %3131, i32 0, i32 0, i32 0) + %3447 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3427, <4 x float> %3446, i32 0, i32 0, i32 0) + %3448 = extractelement <4 x float> %3447, i64 0 + %3449 = extractelement <4 x 
float> %3447, i64 1 + %3450 = extractelement <4 x float> %3447, i64 2 + %3451 = extractelement <4 x float> %3447, i64 3 + %3452 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3424, <4 x float> %3133, i32 0, i32 0, i32 0) + %3453 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3425, <4 x float> %3452, i32 0, i32 0, i32 0) + %3454 = extractelement <4 x float> %3453, i64 0 + %3455 = extractelement <4 x float> %3453, i64 1 + %3456 = extractelement <4 x float> %3453, i64 2 + %3457 = extractelement <4 x float> %3453, i64 3 + %3458 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3424, <4 x float> %3135, i32 0, i32 0, i32 0) + %3459 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3425, <4 x float> %3458, i32 0, i32 0, i32 0) + %3460 = extractelement <4 x float> %3459, i64 0 + %3461 = extractelement <4 x float> %3459, i64 1 + %3462 = extractelement <4 x float> %3459, i64 2 + %3463 = extractelement <4 x float> %3459, i64 3 + %3464 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3426, <4 x float> %3137, i32 0, i32 0, i32 0) + %3465 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3427, <4 x float> %3464, i32 0, i32 0, i32 0) + %3466 = extractelement <4 x float> %3465, i64 0 + %3467 = extractelement <4 x float> %3465, i64 1 + %3468 = extractelement <4 x float> %3465, i64 2 + %3469 = extractelement <4 x float> %3465, i64 3 + %3470 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3426, <4 x float> %3139, i32 0, i32 0, i32 0) + %3471 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3427, <4 x float> %3470, i32 0, i32 0, i32 0) + %3472 = extractelement <4 x float> %3471, i64 0 + %3473 = extractelement <4 x float> %3471, i64 1 + %3474 = extractelement <4 x float> %3471, 
i64 2 + %3475 = extractelement <4 x float> %3471, i64 3 + %3476 = load <8 x half>, ptr addrspace(3) %2882, align 16 + %3477 = load <8 x half>, ptr addrspace(3) %2884, align 16 + %3478 = shufflevector <8 x half> %3476, <8 x half> poison, <4 x i32> + %3479 = shufflevector <8 x half> %3476, <8 x half> poison, <4 x i32> + %3480 = shufflevector <8 x half> %3477, <8 x half> poison, <4 x i32> + %3481 = shufflevector <8 x half> %3477, <8 x half> poison, <4 x i32> + %3482 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3478, <4 x float> %3163, i32 0, i32 0, i32 0) + %3483 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3479, <4 x float> %3482, i32 0, i32 0, i32 0) + %3484 = extractelement <4 x float> %3483, i64 0 + %3485 = extractelement <4 x float> %3483, i64 1 + %3486 = extractelement <4 x float> %3483, i64 2 + %3487 = extractelement <4 x float> %3483, i64 3 + %3488 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3478, <4 x float> %3165, i32 0, i32 0, i32 0) + %3489 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3479, <4 x float> %3488, i32 0, i32 0, i32 0) + %3490 = extractelement <4 x float> %3489, i64 0 + %3491 = extractelement <4 x float> %3489, i64 1 + %3492 = extractelement <4 x float> %3489, i64 2 + %3493 = extractelement <4 x float> %3489, i64 3 + %3494 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3480, <4 x float> %3167, i32 0, i32 0, i32 0) + %3495 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3481, <4 x float> %3494, i32 0, i32 0, i32 0) + %3496 = extractelement <4 x float> %3495, i64 0 + %3497 = extractelement <4 x float> %3495, i64 1 + %3498 = extractelement <4 x float> %3495, i64 2 + %3499 = extractelement <4 x float> %3495, i64 3 + %3500 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 
x half> %3480, <4 x float> %3169, i32 0, i32 0, i32 0) + %3501 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3481, <4 x float> %3500, i32 0, i32 0, i32 0) + %3502 = extractelement <4 x float> %3501, i64 0 + %3503 = extractelement <4 x float> %3501, i64 1 + %3504 = extractelement <4 x float> %3501, i64 2 + %3505 = extractelement <4 x float> %3501, i64 3 + %3506 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3478, <4 x float> %3171, i32 0, i32 0, i32 0) + %3507 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3479, <4 x float> %3506, i32 0, i32 0, i32 0) + %3508 = extractelement <4 x float> %3507, i64 0 + %3509 = extractelement <4 x float> %3507, i64 1 + %3510 = extractelement <4 x float> %3507, i64 2 + %3511 = extractelement <4 x float> %3507, i64 3 + %3512 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3478, <4 x float> %3173, i32 0, i32 0, i32 0) + %3513 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3479, <4 x float> %3512, i32 0, i32 0, i32 0) + %3514 = extractelement <4 x float> %3513, i64 0 + %3515 = extractelement <4 x float> %3513, i64 1 + %3516 = extractelement <4 x float> %3513, i64 2 + %3517 = extractelement <4 x float> %3513, i64 3 + %3518 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3480, <4 x float> %3175, i32 0, i32 0, i32 0) + %3519 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3481, <4 x float> %3518, i32 0, i32 0, i32 0) + %3520 = extractelement <4 x float> %3519, i64 0 + %3521 = extractelement <4 x float> %3519, i64 1 + %3522 = extractelement <4 x float> %3519, i64 2 + %3523 = extractelement <4 x float> %3519, i64 3 + %3524 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3480, <4 x float> %3177, i32 0, i32 0, i32 0) + %3525 
= tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3481, <4 x float> %3524, i32 0, i32 0, i32 0) + %3526 = extractelement <4 x float> %3525, i64 0 + %3527 = extractelement <4 x float> %3525, i64 1 + %3528 = extractelement <4 x float> %3525, i64 2 + %3529 = extractelement <4 x float> %3525, i64 3 + %3530 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3424, <4 x float> %3141, i32 0, i32 0, i32 0) + %3531 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3425, <4 x float> %3530, i32 0, i32 0, i32 0) + %3532 = extractelement <4 x float> %3531, i64 0 + %3533 = extractelement <4 x float> %3531, i64 1 + %3534 = extractelement <4 x float> %3531, i64 2 + %3535 = extractelement <4 x float> %3531, i64 3 + %3536 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3424, <4 x float> %3143, i32 0, i32 0, i32 0) + %3537 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3425, <4 x float> %3536, i32 0, i32 0, i32 0) + %3538 = extractelement <4 x float> %3537, i64 0 + %3539 = extractelement <4 x float> %3537, i64 1 + %3540 = extractelement <4 x float> %3537, i64 2 + %3541 = extractelement <4 x float> %3537, i64 3 + %3542 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3426, <4 x float> %3145, i32 0, i32 0, i32 0) + %3543 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3427, <4 x float> %3542, i32 0, i32 0, i32 0) + %3544 = extractelement <4 x float> %3543, i64 0 + %3545 = extractelement <4 x float> %3543, i64 1 + %3546 = extractelement <4 x float> %3543, i64 2 + %3547 = extractelement <4 x float> %3543, i64 3 + %3548 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3426, <4 x float> %3147, i32 0, i32 0, i32 0) + %3549 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x 
half> %3319, <4 x half> %3427, <4 x float> %3548, i32 0, i32 0, i32 0) + %3550 = extractelement <4 x float> %3549, i64 0 + %3551 = extractelement <4 x float> %3549, i64 1 + %3552 = extractelement <4 x float> %3549, i64 2 + %3553 = extractelement <4 x float> %3549, i64 3 + %3554 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3424, <4 x float> %3149, i32 0, i32 0, i32 0) + %3555 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3425, <4 x float> %3554, i32 0, i32 0, i32 0) + %3556 = extractelement <4 x float> %3555, i64 0 + %3557 = extractelement <4 x float> %3555, i64 1 + %3558 = extractelement <4 x float> %3555, i64 2 + %3559 = extractelement <4 x float> %3555, i64 3 + %3560 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3424, <4 x float> %3151, i32 0, i32 0, i32 0) + %3561 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3425, <4 x float> %3560, i32 0, i32 0, i32 0) + %3562 = extractelement <4 x float> %3561, i64 0 + %3563 = extractelement <4 x float> %3561, i64 1 + %3564 = extractelement <4 x float> %3561, i64 2 + %3565 = extractelement <4 x float> %3561, i64 3 + %3566 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3426, <4 x float> %3153, i32 0, i32 0, i32 0) + %3567 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3427, <4 x float> %3566, i32 0, i32 0, i32 0) + %3568 = extractelement <4 x float> %3567, i64 0 + %3569 = extractelement <4 x float> %3567, i64 1 + %3570 = extractelement <4 x float> %3567, i64 2 + %3571 = extractelement <4 x float> %3567, i64 3 + %3572 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3426, <4 x float> %3155, i32 0, i32 0, i32 0) + %3573 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3427, <4 x float> %3572, i32 0, i32 0, 
i32 0) + %3574 = extractelement <4 x float> %3573, i64 0 + %3575 = extractelement <4 x float> %3573, i64 1 + %3576 = extractelement <4 x float> %3573, i64 2 + %3577 = extractelement <4 x float> %3573, i64 3 + %3578 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3478, <4 x float> %3179, i32 0, i32 0, i32 0) + %3579 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3479, <4 x float> %3578, i32 0, i32 0, i32 0) + %3580 = extractelement <4 x float> %3579, i64 0 + %3581 = extractelement <4 x float> %3579, i64 1 + %3582 = extractelement <4 x float> %3579, i64 2 + %3583 = extractelement <4 x float> %3579, i64 3 + %3584 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3478, <4 x float> %3181, i32 0, i32 0, i32 0) + %3585 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3479, <4 x float> %3584, i32 0, i32 0, i32 0) + %3586 = extractelement <4 x float> %3585, i64 0 + %3587 = extractelement <4 x float> %3585, i64 1 + %3588 = extractelement <4 x float> %3585, i64 2 + %3589 = extractelement <4 x float> %3585, i64 3 + %3590 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3480, <4 x float> %3183, i32 0, i32 0, i32 0) + %3591 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3481, <4 x float> %3590, i32 0, i32 0, i32 0) + %3592 = extractelement <4 x float> %3591, i64 0 + %3593 = extractelement <4 x float> %3591, i64 1 + %3594 = extractelement <4 x float> %3591, i64 2 + %3595 = extractelement <4 x float> %3591, i64 3 + %3596 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3480, <4 x float> %3185, i32 0, i32 0, i32 0) + %3597 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3481, <4 x float> %3596, i32 0, i32 0, i32 0) + %3598 = extractelement <4 x float> %3597, i64 0 + 
%3599 = extractelement <4 x float> %3597, i64 1 + %3600 = extractelement <4 x float> %3597, i64 2 + %3601 = extractelement <4 x float> %3597, i64 3 + %3602 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3478, <4 x float> %3187, i32 0, i32 0, i32 0) + %3603 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3479, <4 x float> %3602, i32 0, i32 0, i32 0) + %3604 = extractelement <4 x float> %3603, i64 0 + %3605 = extractelement <4 x float> %3603, i64 1 + %3606 = extractelement <4 x float> %3603, i64 2 + %3607 = extractelement <4 x float> %3603, i64 3 + %3608 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3478, <4 x float> %3189, i32 0, i32 0, i32 0) + %3609 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3479, <4 x float> %3608, i32 0, i32 0, i32 0) + %3610 = extractelement <4 x float> %3609, i64 0 + %3611 = extractelement <4 x float> %3609, i64 1 + %3612 = extractelement <4 x float> %3609, i64 2 + %3613 = extractelement <4 x float> %3609, i64 3 + %3614 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3480, <4 x float> %3191, i32 0, i32 0, i32 0) + %3615 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3481, <4 x float> %3614, i32 0, i32 0, i32 0) + %3616 = extractelement <4 x float> %3615, i64 0 + %3617 = extractelement <4 x float> %3615, i64 1 + %3618 = extractelement <4 x float> %3615, i64 2 + %3619 = extractelement <4 x float> %3615, i64 3 + %3620 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3480, <4 x float> %3193, i32 0, i32 0, i32 0) + %3621 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3481, <4 x float> %3620, i32 0, i32 0, i32 0) + %3622 = extractelement <4 x float> %3621, i64 0 + %3623 = extractelement <4 x float> %3621, i64 1 + %3624 = 
extractelement <4 x float> %3621, i64 2 + %3625 = extractelement <4 x float> %3621, i64 3 + %3626 = mul i32 %31, %9 + %3627 = sext i32 %3626 to i64 + %3628 = getelementptr half, ptr addrspace(1) %2, i64 %3627 + %3629 = sext i32 %118 to i64 + %3630 = getelementptr half, ptr addrspace(1) %3628, i64 %3629 + %3631 = mul i32 %9, %225 + %3632 = mul i32 %9, %2309 + %3633 = mul i32 %9, %2308 + %3634 = mul i32 %9, %2307 + %3635 = mul i32 %9, %2306 + %3636 = mul i32 %9, %2305 + %3637 = mul i32 %9, %2304 + %3638 = mul i32 %9, %2303 + %3639 = add i32 %3631, %2295 + %3640 = add i32 %3631, %2302 + %3641 = add i32 %3632, %2295 + %3642 = add i32 %3632, %2302 + %3643 = fptrunc float %3208 to half + %3644 = fptrunc float %3209 to half + %3645 = fptrunc float %3210 to half + %3646 = fptrunc float %3211 to half + %3647 = fptrunc float %3214 to half + %3648 = fptrunc float %3215 to half + %3649 = fptrunc float %3216 to half + %3650 = fptrunc float %3217 to half + %3651 = fptrunc float %3220 to half + %3652 = fptrunc float %3221 to half + %3653 = fptrunc float %3222 to half + %3654 = fptrunc float %3223 to half + %3655 = fptrunc float %3226 to half + %3656 = fptrunc float %3227 to half + %3657 = fptrunc float %3228 to half + %3658 = fptrunc float %3229 to half + %3659 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %3630, i16 0, i32 2147483646, i32 159744) + %3660 = insertelement <4 x half> poison, half %3643, i64 0 + %3661 = insertelement <4 x half> %3660, half %3644, i64 1 + %3662 = insertelement <4 x half> %3661, half %3645, i64 2 + %3663 = insertelement <4 x half> %3662, half %3646, i64 3 + %3664 = bitcast <4 x half> %3663 to <2 x i32> + %3665 = shl i32 %3639, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3664, ptr addrspace(8) %3659, i32 %3665, i32 0, i32 0) + %3666 = insertelement <4 x half> poison, half %3647, i64 0 + %3667 = insertelement <4 x half> %3666, half %3648, i64 1 + %3668 = insertelement <4 x half> %3667, half 
%3649, i64 2 + %3669 = insertelement <4 x half> %3668, half %3650, i64 3 + %3670 = bitcast <4 x half> %3669 to <2 x i32> + %3671 = shl i32 %3640, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3670, ptr addrspace(8) %3659, i32 %3671, i32 0, i32 0) + %3672 = insertelement <4 x half> poison, half %3651, i64 0 + %3673 = insertelement <4 x half> %3672, half %3652, i64 1 + %3674 = insertelement <4 x half> %3673, half %3653, i64 2 + %3675 = insertelement <4 x half> %3674, half %3654, i64 3 + %3676 = bitcast <4 x half> %3675 to <2 x i32> + %3677 = shl i32 %3641, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3676, ptr addrspace(8) %3659, i32 %3677, i32 0, i32 0) + %3678 = insertelement <4 x half> poison, half %3655, i64 0 + %3679 = insertelement <4 x half> %3678, half %3656, i64 1 + %3680 = insertelement <4 x half> %3679, half %3657, i64 2 + %3681 = insertelement <4 x half> %3680, half %3658, i64 3 + %3682 = bitcast <4 x half> %3681 to <2 x i32> + %3683 = shl i32 %3642, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3682, ptr addrspace(8) %3659, i32 %3683, i32 0, i32 0) + %3684 = add i32 %3631, %2301 + %3685 = add i32 %3631, %2300 + %3686 = add i32 %3632, %2301 + %3687 = add i32 %3632, %2300 + %3688 = fptrunc float %3238 to half + %3689 = fptrunc float %3239 to half + %3690 = fptrunc float %3240 to half + %3691 = fptrunc float %3241 to half + %3692 = fptrunc float %3244 to half + %3693 = fptrunc float %3245 to half + %3694 = fptrunc float %3246 to half + %3695 = fptrunc float %3247 to half + %3696 = fptrunc float %3250 to half + %3697 = fptrunc float %3251 to half + %3698 = fptrunc float %3252 to half + %3699 = fptrunc float %3253 to half + %3700 = fptrunc float %3256 to half + %3701 = fptrunc float %3257 to half + %3702 = fptrunc float %3258 to half + %3703 = fptrunc float %3259 to half + %3704 = insertelement <4 x half> poison, half %3688, i64 0 + %3705 = insertelement <4 x half> %3704, half %3689, i64 
1 + %3706 = insertelement <4 x half> %3705, half %3690, i64 2 + %3707 = insertelement <4 x half> %3706, half %3691, i64 3 + %3708 = bitcast <4 x half> %3707 to <2 x i32> + %3709 = shl i32 %3684, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3708, ptr addrspace(8) %3659, i32 %3709, i32 0, i32 0) + %3710 = insertelement <4 x half> poison, half %3692, i64 0 + %3711 = insertelement <4 x half> %3710, half %3693, i64 1 + %3712 = insertelement <4 x half> %3711, half %3694, i64 2 + %3713 = insertelement <4 x half> %3712, half %3695, i64 3 + %3714 = bitcast <4 x half> %3713 to <2 x i32> + %3715 = shl i32 %3685, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3714, ptr addrspace(8) %3659, i32 %3715, i32 0, i32 0) + %3716 = insertelement <4 x half> poison, half %3696, i64 0 + %3717 = insertelement <4 x half> %3716, half %3697, i64 1 + %3718 = insertelement <4 x half> %3717, half %3698, i64 2 + %3719 = insertelement <4 x half> %3718, half %3699, i64 3 + %3720 = bitcast <4 x half> %3719 to <2 x i32> + %3721 = shl i32 %3686, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3720, ptr addrspace(8) %3659, i32 %3721, i32 0, i32 0) + %3722 = insertelement <4 x half> poison, half %3700, i64 0 + %3723 = insertelement <4 x half> %3722, half %3701, i64 1 + %3724 = insertelement <4 x half> %3723, half %3702, i64 2 + %3725 = insertelement <4 x half> %3724, half %3703, i64 3 + %3726 = bitcast <4 x half> %3725 to <2 x i32> + %3727 = shl i32 %3687, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3726, ptr addrspace(8) %3659, i32 %3727, i32 0, i32 0) + %3728 = add i32 %3631, %2299 + %3729 = add i32 %3631, %2298 + %3730 = add i32 %3632, %2299 + %3731 = add i32 %3632, %2298 + %3732 = fptrunc float %3322 to half + %3733 = fptrunc float %3323 to half + %3734 = fptrunc float %3324 to half + %3735 = fptrunc float %3325 to half + %3736 = fptrunc float %3328 to half + %3737 = fptrunc float %3329 to half + %3738 = 
fptrunc float %3330 to half + %3739 = fptrunc float %3331 to half + %3740 = fptrunc float %3334 to half + %3741 = fptrunc float %3335 to half + %3742 = fptrunc float %3336 to half + %3743 = fptrunc float %3337 to half + %3744 = fptrunc float %3340 to half + %3745 = fptrunc float %3341 to half + %3746 = fptrunc float %3342 to half + %3747 = fptrunc float %3343 to half + %3748 = insertelement <4 x half> poison, half %3732, i64 0 + %3749 = insertelement <4 x half> %3748, half %3733, i64 1 + %3750 = insertelement <4 x half> %3749, half %3734, i64 2 + %3751 = insertelement <4 x half> %3750, half %3735, i64 3 + %3752 = bitcast <4 x half> %3751 to <2 x i32> + %3753 = shl i32 %3728, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3752, ptr addrspace(8) %3659, i32 %3753, i32 0, i32 0) + %3754 = insertelement <4 x half> poison, half %3736, i64 0 + %3755 = insertelement <4 x half> %3754, half %3737, i64 1 + %3756 = insertelement <4 x half> %3755, half %3738, i64 2 + %3757 = insertelement <4 x half> %3756, half %3739, i64 3 + %3758 = bitcast <4 x half> %3757 to <2 x i32> + %3759 = shl i32 %3729, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3758, ptr addrspace(8) %3659, i32 %3759, i32 0, i32 0) + %3760 = insertelement <4 x half> poison, half %3740, i64 0 + %3761 = insertelement <4 x half> %3760, half %3741, i64 1 + %3762 = insertelement <4 x half> %3761, half %3742, i64 2 + %3763 = insertelement <4 x half> %3762, half %3743, i64 3 + %3764 = bitcast <4 x half> %3763 to <2 x i32> + %3765 = shl i32 %3730, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3764, ptr addrspace(8) %3659, i32 %3765, i32 0, i32 0) + %3766 = insertelement <4 x half> poison, half %3744, i64 0 + %3767 = insertelement <4 x half> %3766, half %3745, i64 1 + %3768 = insertelement <4 x half> %3767, half %3746, i64 2 + %3769 = insertelement <4 x half> %3768, half %3747, i64 3 + %3770 = bitcast <4 x half> %3769 to <2 x i32> + %3771 = shl i32 
%3731, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3770, ptr addrspace(8) %3659, i32 %3771, i32 0, i32 0) + %3772 = add i32 %3631, %2297 + %3773 = add i32 %3631, %2296 + %3774 = add i32 %3632, %2297 + %3775 = add i32 %3632, %2296 + %3776 = fptrunc float %3352 to half + %3777 = fptrunc float %3353 to half + %3778 = fptrunc float %3354 to half + %3779 = fptrunc float %3355 to half + %3780 = fptrunc float %3358 to half + %3781 = fptrunc float %3359 to half + %3782 = fptrunc float %3360 to half + %3783 = fptrunc float %3361 to half + %3784 = fptrunc float %3364 to half + %3785 = fptrunc float %3365 to half + %3786 = fptrunc float %3366 to half + %3787 = fptrunc float %3367 to half + %3788 = fptrunc float %3370 to half + %3789 = fptrunc float %3371 to half + %3790 = fptrunc float %3372 to half + %3791 = fptrunc float %3373 to half + %3792 = insertelement <4 x half> poison, half %3776, i64 0 + %3793 = insertelement <4 x half> %3792, half %3777, i64 1 + %3794 = insertelement <4 x half> %3793, half %3778, i64 2 + %3795 = insertelement <4 x half> %3794, half %3779, i64 3 + %3796 = bitcast <4 x half> %3795 to <2 x i32> + %3797 = shl i32 %3772, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3796, ptr addrspace(8) %3659, i32 %3797, i32 0, i32 0) + %3798 = insertelement <4 x half> poison, half %3780, i64 0 + %3799 = insertelement <4 x half> %3798, half %3781, i64 1 + %3800 = insertelement <4 x half> %3799, half %3782, i64 2 + %3801 = insertelement <4 x half> %3800, half %3783, i64 3 + %3802 = bitcast <4 x half> %3801 to <2 x i32> + %3803 = shl i32 %3773, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3802, ptr addrspace(8) %3659, i32 %3803, i32 0, i32 0) + %3804 = insertelement <4 x half> poison, half %3784, i64 0 + %3805 = insertelement <4 x half> %3804, half %3785, i64 1 + %3806 = insertelement <4 x half> %3805, half %3786, i64 2 + %3807 = insertelement <4 x half> %3806, half %3787, i64 3 + %3808 = 
bitcast <4 x half> %3807 to <2 x i32> + %3809 = shl i32 %3774, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3808, ptr addrspace(8) %3659, i32 %3809, i32 0, i32 0) + %3810 = insertelement <4 x half> poison, half %3788, i64 0 + %3811 = insertelement <4 x half> %3810, half %3789, i64 1 + %3812 = insertelement <4 x half> %3811, half %3790, i64 2 + %3813 = insertelement <4 x half> %3812, half %3791, i64 3 + %3814 = bitcast <4 x half> %3813 to <2 x i32> + %3815 = shl i32 %3775, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3814, ptr addrspace(8) %3659, i32 %3815, i32 0, i32 0) + %3816 = add i32 %3633, %2295 + %3817 = add i32 %3633, %2302 + %3818 = add i32 %3634, %2295 + %3819 = add i32 %3634, %2302 + %3820 = fptrunc float %3268 to half + %3821 = fptrunc float %3269 to half + %3822 = fptrunc float %3270 to half + %3823 = fptrunc float %3271 to half + %3824 = fptrunc float %3274 to half + %3825 = fptrunc float %3275 to half + %3826 = fptrunc float %3276 to half + %3827 = fptrunc float %3277 to half + %3828 = fptrunc float %3280 to half + %3829 = fptrunc float %3281 to half + %3830 = fptrunc float %3282 to half + %3831 = fptrunc float %3283 to half + %3832 = fptrunc float %3286 to half + %3833 = fptrunc float %3287 to half + %3834 = fptrunc float %3288 to half + %3835 = fptrunc float %3289 to half + %3836 = insertelement <4 x half> poison, half %3820, i64 0 + %3837 = insertelement <4 x half> %3836, half %3821, i64 1 + %3838 = insertelement <4 x half> %3837, half %3822, i64 2 + %3839 = insertelement <4 x half> %3838, half %3823, i64 3 + %3840 = bitcast <4 x half> %3839 to <2 x i32> + %3841 = shl i32 %3816, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3840, ptr addrspace(8) %3659, i32 %3841, i32 0, i32 0) + %3842 = insertelement <4 x half> poison, half %3824, i64 0 + %3843 = insertelement <4 x half> %3842, half %3825, i64 1 + %3844 = insertelement <4 x half> %3843, half %3826, i64 2 + %3845 = 
insertelement <4 x half> %3844, half %3827, i64 3 + %3846 = bitcast <4 x half> %3845 to <2 x i32> + %3847 = shl i32 %3817, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3846, ptr addrspace(8) %3659, i32 %3847, i32 0, i32 0) + %3848 = insertelement <4 x half> poison, half %3828, i64 0 + %3849 = insertelement <4 x half> %3848, half %3829, i64 1 + %3850 = insertelement <4 x half> %3849, half %3830, i64 2 + %3851 = insertelement <4 x half> %3850, half %3831, i64 3 + %3852 = bitcast <4 x half> %3851 to <2 x i32> + %3853 = shl i32 %3818, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3852, ptr addrspace(8) %3659, i32 %3853, i32 0, i32 0) + %3854 = insertelement <4 x half> poison, half %3832, i64 0 + %3855 = insertelement <4 x half> %3854, half %3833, i64 1 + %3856 = insertelement <4 x half> %3855, half %3834, i64 2 + %3857 = insertelement <4 x half> %3856, half %3835, i64 3 + %3858 = bitcast <4 x half> %3857 to <2 x i32> + %3859 = shl i32 %3819, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3858, ptr addrspace(8) %3659, i32 %3859, i32 0, i32 0) + %3860 = add i32 %3633, %2301 + %3861 = add i32 %3633, %2300 + %3862 = add i32 %3634, %2301 + %3863 = add i32 %3634, %2300 + %3864 = fptrunc float %3292 to half + %3865 = fptrunc float %3293 to half + %3866 = fptrunc float %3294 to half + %3867 = fptrunc float %3295 to half + %3868 = fptrunc float %3298 to half + %3869 = fptrunc float %3299 to half + %3870 = fptrunc float %3300 to half + %3871 = fptrunc float %3301 to half + %3872 = fptrunc float %3304 to half + %3873 = fptrunc float %3305 to half + %3874 = fptrunc float %3306 to half + %3875 = fptrunc float %3307 to half + %3876 = fptrunc float %3310 to half + %3877 = fptrunc float %3311 to half + %3878 = fptrunc float %3312 to half + %3879 = fptrunc float %3313 to half + %3880 = insertelement <4 x half> poison, half %3864, i64 0 + %3881 = insertelement <4 x half> %3880, half %3865, i64 1 + %3882 = 
insertelement <4 x half> %3881, half %3866, i64 2 + %3883 = insertelement <4 x half> %3882, half %3867, i64 3 + %3884 = bitcast <4 x half> %3883 to <2 x i32> + %3885 = shl i32 %3860, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3884, ptr addrspace(8) %3659, i32 %3885, i32 0, i32 0) + %3886 = insertelement <4 x half> poison, half %3868, i64 0 + %3887 = insertelement <4 x half> %3886, half %3869, i64 1 + %3888 = insertelement <4 x half> %3887, half %3870, i64 2 + %3889 = insertelement <4 x half> %3888, half %3871, i64 3 + %3890 = bitcast <4 x half> %3889 to <2 x i32> + %3891 = shl i32 %3861, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3890, ptr addrspace(8) %3659, i32 %3891, i32 0, i32 0) + %3892 = insertelement <4 x half> poison, half %3872, i64 0 + %3893 = insertelement <4 x half> %3892, half %3873, i64 1 + %3894 = insertelement <4 x half> %3893, half %3874, i64 2 + %3895 = insertelement <4 x half> %3894, half %3875, i64 3 + %3896 = bitcast <4 x half> %3895 to <2 x i32> + %3897 = shl i32 %3862, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3896, ptr addrspace(8) %3659, i32 %3897, i32 0, i32 0) + %3898 = insertelement <4 x half> poison, half %3876, i64 0 + %3899 = insertelement <4 x half> %3898, half %3877, i64 1 + %3900 = insertelement <4 x half> %3899, half %3878, i64 2 + %3901 = insertelement <4 x half> %3900, half %3879, i64 3 + %3902 = bitcast <4 x half> %3901 to <2 x i32> + %3903 = shl i32 %3863, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3902, ptr addrspace(8) %3659, i32 %3903, i32 0, i32 0) + %3904 = add i32 %3633, %2299 + %3905 = add i32 %3633, %2298 + %3906 = add i32 %3634, %2299 + %3907 = add i32 %3634, %2298 + %3908 = fptrunc float %3376 to half + %3909 = fptrunc float %3377 to half + %3910 = fptrunc float %3378 to half + %3911 = fptrunc float %3379 to half + %3912 = fptrunc float %3382 to half + %3913 = fptrunc float %3383 to half + %3914 = fptrunc 
float %3384 to half + %3915 = fptrunc float %3385 to half + %3916 = fptrunc float %3388 to half + %3917 = fptrunc float %3389 to half + %3918 = fptrunc float %3390 to half + %3919 = fptrunc float %3391 to half + %3920 = fptrunc float %3394 to half + %3921 = fptrunc float %3395 to half + %3922 = fptrunc float %3396 to half + %3923 = fptrunc float %3397 to half + %3924 = insertelement <4 x half> poison, half %3908, i64 0 + %3925 = insertelement <4 x half> %3924, half %3909, i64 1 + %3926 = insertelement <4 x half> %3925, half %3910, i64 2 + %3927 = insertelement <4 x half> %3926, half %3911, i64 3 + %3928 = bitcast <4 x half> %3927 to <2 x i32> + %3929 = shl i32 %3904, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3928, ptr addrspace(8) %3659, i32 %3929, i32 0, i32 0) + %3930 = insertelement <4 x half> poison, half %3912, i64 0 + %3931 = insertelement <4 x half> %3930, half %3913, i64 1 + %3932 = insertelement <4 x half> %3931, half %3914, i64 2 + %3933 = insertelement <4 x half> %3932, half %3915, i64 3 + %3934 = bitcast <4 x half> %3933 to <2 x i32> + %3935 = shl i32 %3905, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3934, ptr addrspace(8) %3659, i32 %3935, i32 0, i32 0) + %3936 = insertelement <4 x half> poison, half %3916, i64 0 + %3937 = insertelement <4 x half> %3936, half %3917, i64 1 + %3938 = insertelement <4 x half> %3937, half %3918, i64 2 + %3939 = insertelement <4 x half> %3938, half %3919, i64 3 + %3940 = bitcast <4 x half> %3939 to <2 x i32> + %3941 = shl i32 %3906, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3940, ptr addrspace(8) %3659, i32 %3941, i32 0, i32 0) + %3942 = insertelement <4 x half> poison, half %3920, i64 0 + %3943 = insertelement <4 x half> %3942, half %3921, i64 1 + %3944 = insertelement <4 x half> %3943, half %3922, i64 2 + %3945 = insertelement <4 x half> %3944, half %3923, i64 3 + %3946 = bitcast <4 x half> %3945 to <2 x i32> + %3947 = shl i32 %3907, 1 + 
tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3946, ptr addrspace(8) %3659, i32 %3947, i32 0, i32 0) + %3948 = add i32 %3633, %2297 + %3949 = add i32 %3633, %2296 + %3950 = add i32 %3634, %2297 + %3951 = add i32 %3634, %2296 + %3952 = fptrunc float %3400 to half + %3953 = fptrunc float %3401 to half + %3954 = fptrunc float %3402 to half + %3955 = fptrunc float %3403 to half + %3956 = fptrunc float %3406 to half + %3957 = fptrunc float %3407 to half + %3958 = fptrunc float %3408 to half + %3959 = fptrunc float %3409 to half + %3960 = fptrunc float %3412 to half + %3961 = fptrunc float %3413 to half + %3962 = fptrunc float %3414 to half + %3963 = fptrunc float %3415 to half + %3964 = fptrunc float %3418 to half + %3965 = fptrunc float %3419 to half + %3966 = fptrunc float %3420 to half + %3967 = fptrunc float %3421 to half + %3968 = insertelement <4 x half> poison, half %3952, i64 0 + %3969 = insertelement <4 x half> %3968, half %3953, i64 1 + %3970 = insertelement <4 x half> %3969, half %3954, i64 2 + %3971 = insertelement <4 x half> %3970, half %3955, i64 3 + %3972 = bitcast <4 x half> %3971 to <2 x i32> + %3973 = shl i32 %3948, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3972, ptr addrspace(8) %3659, i32 %3973, i32 0, i32 0) + %3974 = insertelement <4 x half> poison, half %3956, i64 0 + %3975 = insertelement <4 x half> %3974, half %3957, i64 1 + %3976 = insertelement <4 x half> %3975, half %3958, i64 2 + %3977 = insertelement <4 x half> %3976, half %3959, i64 3 + %3978 = bitcast <4 x half> %3977 to <2 x i32> + %3979 = shl i32 %3949, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3978, ptr addrspace(8) %3659, i32 %3979, i32 0, i32 0) + %3980 = insertelement <4 x half> poison, half %3960, i64 0 + %3981 = insertelement <4 x half> %3980, half %3961, i64 1 + %3982 = insertelement <4 x half> %3981, half %3962, i64 2 + %3983 = insertelement <4 x half> %3982, half %3963, i64 3 + %3984 = bitcast <4 x 
half> %3983 to <2 x i32> + %3985 = shl i32 %3950, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3984, ptr addrspace(8) %3659, i32 %3985, i32 0, i32 0) + %3986 = insertelement <4 x half> poison, half %3964, i64 0 + %3987 = insertelement <4 x half> %3986, half %3965, i64 1 + %3988 = insertelement <4 x half> %3987, half %3966, i64 2 + %3989 = insertelement <4 x half> %3988, half %3967, i64 3 + %3990 = bitcast <4 x half> %3989 to <2 x i32> + %3991 = shl i32 %3951, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3990, ptr addrspace(8) %3659, i32 %3991, i32 0, i32 0) + %3992 = add i32 %3635, %2295 + %3993 = add i32 %3635, %2302 + %3994 = add i32 %3636, %2295 + %3995 = add i32 %3636, %2302 + %3996 = fptrunc float %3430 to half + %3997 = fptrunc float %3431 to half + %3998 = fptrunc float %3432 to half + %3999 = fptrunc float %3433 to half + %4000 = fptrunc float %3436 to half + %4001 = fptrunc float %3437 to half + %4002 = fptrunc float %3438 to half + %4003 = fptrunc float %3439 to half + %4004 = fptrunc float %3442 to half + %4005 = fptrunc float %3443 to half + %4006 = fptrunc float %3444 to half + %4007 = fptrunc float %3445 to half + %4008 = fptrunc float %3448 to half + %4009 = fptrunc float %3449 to half + %4010 = fptrunc float %3450 to half + %4011 = fptrunc float %3451 to half + %4012 = insertelement <4 x half> poison, half %3996, i64 0 + %4013 = insertelement <4 x half> %4012, half %3997, i64 1 + %4014 = insertelement <4 x half> %4013, half %3998, i64 2 + %4015 = insertelement <4 x half> %4014, half %3999, i64 3 + %4016 = bitcast <4 x half> %4015 to <2 x i32> + %4017 = shl i32 %3992, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4016, ptr addrspace(8) %3659, i32 %4017, i32 0, i32 0) + %4018 = insertelement <4 x half> poison, half %4000, i64 0 + %4019 = insertelement <4 x half> %4018, half %4001, i64 1 + %4020 = insertelement <4 x half> %4019, half %4002, i64 2 + %4021 = insertelement <4 x 
half> %4020, half %4003, i64 3 + %4022 = bitcast <4 x half> %4021 to <2 x i32> + %4023 = shl i32 %3993, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4022, ptr addrspace(8) %3659, i32 %4023, i32 0, i32 0) + %4024 = insertelement <4 x half> poison, half %4004, i64 0 + %4025 = insertelement <4 x half> %4024, half %4005, i64 1 + %4026 = insertelement <4 x half> %4025, half %4006, i64 2 + %4027 = insertelement <4 x half> %4026, half %4007, i64 3 + %4028 = bitcast <4 x half> %4027 to <2 x i32> + %4029 = shl i32 %3994, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4028, ptr addrspace(8) %3659, i32 %4029, i32 0, i32 0) + %4030 = insertelement <4 x half> poison, half %4008, i64 0 + %4031 = insertelement <4 x half> %4030, half %4009, i64 1 + %4032 = insertelement <4 x half> %4031, half %4010, i64 2 + %4033 = insertelement <4 x half> %4032, half %4011, i64 3 + %4034 = bitcast <4 x half> %4033 to <2 x i32> + %4035 = shl i32 %3995, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4034, ptr addrspace(8) %3659, i32 %4035, i32 0, i32 0) + %4036 = add i32 %3635, %2301 + %4037 = add i32 %3635, %2300 + %4038 = add i32 %3636, %2301 + %4039 = add i32 %3636, %2300 + %4040 = fptrunc float %3454 to half + %4041 = fptrunc float %3455 to half + %4042 = fptrunc float %3456 to half + %4043 = fptrunc float %3457 to half + %4044 = fptrunc float %3460 to half + %4045 = fptrunc float %3461 to half + %4046 = fptrunc float %3462 to half + %4047 = fptrunc float %3463 to half + %4048 = fptrunc float %3466 to half + %4049 = fptrunc float %3467 to half + %4050 = fptrunc float %3468 to half + %4051 = fptrunc float %3469 to half + %4052 = fptrunc float %3472 to half + %4053 = fptrunc float %3473 to half + %4054 = fptrunc float %3474 to half + %4055 = fptrunc float %3475 to half + %4056 = insertelement <4 x half> poison, half %4040, i64 0 + %4057 = insertelement <4 x half> %4056, half %4041, i64 1 + %4058 = insertelement <4 x half> 
%4057, half %4042, i64 2 + %4059 = insertelement <4 x half> %4058, half %4043, i64 3 + %4060 = bitcast <4 x half> %4059 to <2 x i32> + %4061 = shl i32 %4036, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4060, ptr addrspace(8) %3659, i32 %4061, i32 0, i32 0) + %4062 = insertelement <4 x half> poison, half %4044, i64 0 + %4063 = insertelement <4 x half> %4062, half %4045, i64 1 + %4064 = insertelement <4 x half> %4063, half %4046, i64 2 + %4065 = insertelement <4 x half> %4064, half %4047, i64 3 + %4066 = bitcast <4 x half> %4065 to <2 x i32> + %4067 = shl i32 %4037, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4066, ptr addrspace(8) %3659, i32 %4067, i32 0, i32 0) + %4068 = insertelement <4 x half> poison, half %4048, i64 0 + %4069 = insertelement <4 x half> %4068, half %4049, i64 1 + %4070 = insertelement <4 x half> %4069, half %4050, i64 2 + %4071 = insertelement <4 x half> %4070, half %4051, i64 3 + %4072 = bitcast <4 x half> %4071 to <2 x i32> + %4073 = shl i32 %4038, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4072, ptr addrspace(8) %3659, i32 %4073, i32 0, i32 0) + %4074 = insertelement <4 x half> poison, half %4052, i64 0 + %4075 = insertelement <4 x half> %4074, half %4053, i64 1 + %4076 = insertelement <4 x half> %4075, half %4054, i64 2 + %4077 = insertelement <4 x half> %4076, half %4055, i64 3 + %4078 = bitcast <4 x half> %4077 to <2 x i32> + %4079 = shl i32 %4039, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4078, ptr addrspace(8) %3659, i32 %4079, i32 0, i32 0) + %4080 = add i32 %3635, %2299 + %4081 = add i32 %3635, %2298 + %4082 = add i32 %3636, %2299 + %4083 = add i32 %3636, %2298 + %4084 = fptrunc float %3532 to half + %4085 = fptrunc float %3533 to half + %4086 = fptrunc float %3534 to half + %4087 = fptrunc float %3535 to half + %4088 = fptrunc float %3538 to half + %4089 = fptrunc float %3539 to half + %4090 = fptrunc float %3540 to half + %4091 = 
fptrunc float %3541 to half + %4092 = fptrunc float %3544 to half + %4093 = fptrunc float %3545 to half + %4094 = fptrunc float %3546 to half + %4095 = fptrunc float %3547 to half + %4096 = fptrunc float %3550 to half + %4097 = fptrunc float %3551 to half + %4098 = fptrunc float %3552 to half + %4099 = fptrunc float %3553 to half + %4100 = insertelement <4 x half> poison, half %4084, i64 0 + %4101 = insertelement <4 x half> %4100, half %4085, i64 1 + %4102 = insertelement <4 x half> %4101, half %4086, i64 2 + %4103 = insertelement <4 x half> %4102, half %4087, i64 3 + %4104 = bitcast <4 x half> %4103 to <2 x i32> + %4105 = shl i32 %4080, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4104, ptr addrspace(8) %3659, i32 %4105, i32 0, i32 0) + %4106 = insertelement <4 x half> poison, half %4088, i64 0 + %4107 = insertelement <4 x half> %4106, half %4089, i64 1 + %4108 = insertelement <4 x half> %4107, half %4090, i64 2 + %4109 = insertelement <4 x half> %4108, half %4091, i64 3 + %4110 = bitcast <4 x half> %4109 to <2 x i32> + %4111 = shl i32 %4081, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4110, ptr addrspace(8) %3659, i32 %4111, i32 0, i32 0) + %4112 = insertelement <4 x half> poison, half %4092, i64 0 + %4113 = insertelement <4 x half> %4112, half %4093, i64 1 + %4114 = insertelement <4 x half> %4113, half %4094, i64 2 + %4115 = insertelement <4 x half> %4114, half %4095, i64 3 + %4116 = bitcast <4 x half> %4115 to <2 x i32> + %4117 = shl i32 %4082, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4116, ptr addrspace(8) %3659, i32 %4117, i32 0, i32 0) + %4118 = insertelement <4 x half> poison, half %4096, i64 0 + %4119 = insertelement <4 x half> %4118, half %4097, i64 1 + %4120 = insertelement <4 x half> %4119, half %4098, i64 2 + %4121 = insertelement <4 x half> %4120, half %4099, i64 3 + %4122 = bitcast <4 x half> %4121 to <2 x i32> + %4123 = shl i32 %4083, 1 + tail call void 
@llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4122, ptr addrspace(8) %3659, i32 %4123, i32 0, i32 0) + %4124 = add i32 %3635, %2297 + %4125 = add i32 %3635, %2296 + %4126 = add i32 %3636, %2297 + %4127 = add i32 %3636, %2296 + %4128 = fptrunc float %3556 to half + %4129 = fptrunc float %3557 to half + %4130 = fptrunc float %3558 to half + %4131 = fptrunc float %3559 to half + %4132 = fptrunc float %3562 to half + %4133 = fptrunc float %3563 to half + %4134 = fptrunc float %3564 to half + %4135 = fptrunc float %3565 to half + %4136 = fptrunc float %3568 to half + %4137 = fptrunc float %3569 to half + %4138 = fptrunc float %3570 to half + %4139 = fptrunc float %3571 to half + %4140 = fptrunc float %3574 to half + %4141 = fptrunc float %3575 to half + %4142 = fptrunc float %3576 to half + %4143 = fptrunc float %3577 to half + %4144 = insertelement <4 x half> poison, half %4128, i64 0 + %4145 = insertelement <4 x half> %4144, half %4129, i64 1 + %4146 = insertelement <4 x half> %4145, half %4130, i64 2 + %4147 = insertelement <4 x half> %4146, half %4131, i64 3 + %4148 = bitcast <4 x half> %4147 to <2 x i32> + %4149 = shl i32 %4124, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4148, ptr addrspace(8) %3659, i32 %4149, i32 0, i32 0) + %4150 = insertelement <4 x half> poison, half %4132, i64 0 + %4151 = insertelement <4 x half> %4150, half %4133, i64 1 + %4152 = insertelement <4 x half> %4151, half %4134, i64 2 + %4153 = insertelement <4 x half> %4152, half %4135, i64 3 + %4154 = bitcast <4 x half> %4153 to <2 x i32> + %4155 = shl i32 %4125, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4154, ptr addrspace(8) %3659, i32 %4155, i32 0, i32 0) + %4156 = insertelement <4 x half> poison, half %4136, i64 0 + %4157 = insertelement <4 x half> %4156, half %4137, i64 1 + %4158 = insertelement <4 x half> %4157, half %4138, i64 2 + %4159 = insertelement <4 x half> %4158, half %4139, i64 3 + %4160 = bitcast <4 x half> %4159 to <2 x 
i32> + %4161 = shl i32 %4126, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4160, ptr addrspace(8) %3659, i32 %4161, i32 0, i32 0) + %4162 = insertelement <4 x half> poison, half %4140, i64 0 + %4163 = insertelement <4 x half> %4162, half %4141, i64 1 + %4164 = insertelement <4 x half> %4163, half %4142, i64 2 + %4165 = insertelement <4 x half> %4164, half %4143, i64 3 + %4166 = bitcast <4 x half> %4165 to <2 x i32> + %4167 = shl i32 %4127, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4166, ptr addrspace(8) %3659, i32 %4167, i32 0, i32 0) + %4168 = add i32 %3637, %2295 + %4169 = add i32 %3637, %2302 + %4170 = add i32 %3638, %2295 + %4171 = add i32 %3638, %2302 + %4172 = fptrunc float %3484 to half + %4173 = fptrunc float %3485 to half + %4174 = fptrunc float %3486 to half + %4175 = fptrunc float %3487 to half + %4176 = fptrunc float %3490 to half + %4177 = fptrunc float %3491 to half + %4178 = fptrunc float %3492 to half + %4179 = fptrunc float %3493 to half + %4180 = fptrunc float %3496 to half + %4181 = fptrunc float %3497 to half + %4182 = fptrunc float %3498 to half + %4183 = fptrunc float %3499 to half + %4184 = fptrunc float %3502 to half + %4185 = fptrunc float %3503 to half + %4186 = fptrunc float %3504 to half + %4187 = fptrunc float %3505 to half + %4188 = insertelement <4 x half> poison, half %4172, i64 0 + %4189 = insertelement <4 x half> %4188, half %4173, i64 1 + %4190 = insertelement <4 x half> %4189, half %4174, i64 2 + %4191 = insertelement <4 x half> %4190, half %4175, i64 3 + %4192 = bitcast <4 x half> %4191 to <2 x i32> + %4193 = shl i32 %4168, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4192, ptr addrspace(8) %3659, i32 %4193, i32 0, i32 0) + %4194 = insertelement <4 x half> poison, half %4176, i64 0 + %4195 = insertelement <4 x half> %4194, half %4177, i64 1 + %4196 = insertelement <4 x half> %4195, half %4178, i64 2 + %4197 = insertelement <4 x half> %4196, half %4179, 
i64 3 + %4198 = bitcast <4 x half> %4197 to <2 x i32> + %4199 = shl i32 %4169, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4198, ptr addrspace(8) %3659, i32 %4199, i32 0, i32 0) + %4200 = insertelement <4 x half> poison, half %4180, i64 0 + %4201 = insertelement <4 x half> %4200, half %4181, i64 1 + %4202 = insertelement <4 x half> %4201, half %4182, i64 2 + %4203 = insertelement <4 x half> %4202, half %4183, i64 3 + %4204 = bitcast <4 x half> %4203 to <2 x i32> + %4205 = shl i32 %4170, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4204, ptr addrspace(8) %3659, i32 %4205, i32 0, i32 0) + %4206 = insertelement <4 x half> poison, half %4184, i64 0 + %4207 = insertelement <4 x half> %4206, half %4185, i64 1 + %4208 = insertelement <4 x half> %4207, half %4186, i64 2 + %4209 = insertelement <4 x half> %4208, half %4187, i64 3 + %4210 = bitcast <4 x half> %4209 to <2 x i32> + %4211 = shl i32 %4171, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4210, ptr addrspace(8) %3659, i32 %4211, i32 0, i32 0) + %4212 = add i32 %3637, %2301 + %4213 = add i32 %3637, %2300 + %4214 = add i32 %3638, %2301 + %4215 = add i32 %3638, %2300 + %4216 = fptrunc float %3508 to half + %4217 = fptrunc float %3509 to half + %4218 = fptrunc float %3510 to half + %4219 = fptrunc float %3511 to half + %4220 = fptrunc float %3514 to half + %4221 = fptrunc float %3515 to half + %4222 = fptrunc float %3516 to half + %4223 = fptrunc float %3517 to half + %4224 = fptrunc float %3520 to half + %4225 = fptrunc float %3521 to half + %4226 = fptrunc float %3522 to half + %4227 = fptrunc float %3523 to half + %4228 = fptrunc float %3526 to half + %4229 = fptrunc float %3527 to half + %4230 = fptrunc float %3528 to half + %4231 = fptrunc float %3529 to half + %4232 = insertelement <4 x half> poison, half %4216, i64 0 + %4233 = insertelement <4 x half> %4232, half %4217, i64 1 + %4234 = insertelement <4 x half> %4233, half %4218, i64 2 + 
%4235 = insertelement <4 x half> %4234, half %4219, i64 3 + %4236 = bitcast <4 x half> %4235 to <2 x i32> + %4237 = shl i32 %4212, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4236, ptr addrspace(8) %3659, i32 %4237, i32 0, i32 0) + %4238 = insertelement <4 x half> poison, half %4220, i64 0 + %4239 = insertelement <4 x half> %4238, half %4221, i64 1 + %4240 = insertelement <4 x half> %4239, half %4222, i64 2 + %4241 = insertelement <4 x half> %4240, half %4223, i64 3 + %4242 = bitcast <4 x half> %4241 to <2 x i32> + %4243 = shl i32 %4213, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4242, ptr addrspace(8) %3659, i32 %4243, i32 0, i32 0) + %4244 = insertelement <4 x half> poison, half %4224, i64 0 + %4245 = insertelement <4 x half> %4244, half %4225, i64 1 + %4246 = insertelement <4 x half> %4245, half %4226, i64 2 + %4247 = insertelement <4 x half> %4246, half %4227, i64 3 + %4248 = bitcast <4 x half> %4247 to <2 x i32> + %4249 = shl i32 %4214, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4248, ptr addrspace(8) %3659, i32 %4249, i32 0, i32 0) + %4250 = insertelement <4 x half> poison, half %4228, i64 0 + %4251 = insertelement <4 x half> %4250, half %4229, i64 1 + %4252 = insertelement <4 x half> %4251, half %4230, i64 2 + %4253 = insertelement <4 x half> %4252, half %4231, i64 3 + %4254 = bitcast <4 x half> %4253 to <2 x i32> + %4255 = shl i32 %4215, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4254, ptr addrspace(8) %3659, i32 %4255, i32 0, i32 0) + %4256 = add i32 %3637, %2299 + %4257 = add i32 %3637, %2298 + %4258 = add i32 %3638, %2299 + %4259 = add i32 %3638, %2298 + %4260 = fptrunc float %3580 to half + %4261 = fptrunc float %3581 to half + %4262 = fptrunc float %3582 to half + %4263 = fptrunc float %3583 to half + %4264 = fptrunc float %3586 to half + %4265 = fptrunc float %3587 to half + %4266 = fptrunc float %3588 to half + %4267 = fptrunc float %3589 to 
half + %4268 = fptrunc float %3592 to half + %4269 = fptrunc float %3593 to half + %4270 = fptrunc float %3594 to half + %4271 = fptrunc float %3595 to half + %4272 = fptrunc float %3598 to half + %4273 = fptrunc float %3599 to half + %4274 = fptrunc float %3600 to half + %4275 = fptrunc float %3601 to half + %4276 = insertelement <4 x half> poison, half %4260, i64 0 + %4277 = insertelement <4 x half> %4276, half %4261, i64 1 + %4278 = insertelement <4 x half> %4277, half %4262, i64 2 + %4279 = insertelement <4 x half> %4278, half %4263, i64 3 + %4280 = bitcast <4 x half> %4279 to <2 x i32> + %4281 = shl i32 %4256, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4280, ptr addrspace(8) %3659, i32 %4281, i32 0, i32 0) + %4282 = insertelement <4 x half> poison, half %4264, i64 0 + %4283 = insertelement <4 x half> %4282, half %4265, i64 1 + %4284 = insertelement <4 x half> %4283, half %4266, i64 2 + %4285 = insertelement <4 x half> %4284, half %4267, i64 3 + %4286 = bitcast <4 x half> %4285 to <2 x i32> + %4287 = shl i32 %4257, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4286, ptr addrspace(8) %3659, i32 %4287, i32 0, i32 0) + %4288 = insertelement <4 x half> poison, half %4268, i64 0 + %4289 = insertelement <4 x half> %4288, half %4269, i64 1 + %4290 = insertelement <4 x half> %4289, half %4270, i64 2 + %4291 = insertelement <4 x half> %4290, half %4271, i64 3 + %4292 = bitcast <4 x half> %4291 to <2 x i32> + %4293 = shl i32 %4258, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4292, ptr addrspace(8) %3659, i32 %4293, i32 0, i32 0) + %4294 = insertelement <4 x half> poison, half %4272, i64 0 + %4295 = insertelement <4 x half> %4294, half %4273, i64 1 + %4296 = insertelement <4 x half> %4295, half %4274, i64 2 + %4297 = insertelement <4 x half> %4296, half %4275, i64 3 + %4298 = bitcast <4 x half> %4297 to <2 x i32> + %4299 = shl i32 %4259, 1 + tail call void 
@llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4298, ptr addrspace(8) %3659, i32 %4299, i32 0, i32 0) + %4300 = add i32 %3637, %2297 + %4301 = add i32 %3637, %2296 + %4302 = add i32 %3638, %2297 + %4303 = add i32 %3638, %2296 + %4304 = fptrunc float %3604 to half + %4305 = fptrunc float %3605 to half + %4306 = fptrunc float %3606 to half + %4307 = fptrunc float %3607 to half + %4308 = fptrunc float %3610 to half + %4309 = fptrunc float %3611 to half + %4310 = fptrunc float %3612 to half + %4311 = fptrunc float %3613 to half + %4312 = fptrunc float %3616 to half + %4313 = fptrunc float %3617 to half + %4314 = fptrunc float %3618 to half + %4315 = fptrunc float %3619 to half + %4316 = fptrunc float %3622 to half + %4317 = fptrunc float %3623 to half + %4318 = fptrunc float %3624 to half + %4319 = fptrunc float %3625 to half + %4320 = insertelement <4 x half> poison, half %4304, i64 0 + %4321 = insertelement <4 x half> %4320, half %4305, i64 1 + %4322 = insertelement <4 x half> %4321, half %4306, i64 2 + %4323 = insertelement <4 x half> %4322, half %4307, i64 3 + %4324 = bitcast <4 x half> %4323 to <2 x i32> + %4325 = shl i32 %4300, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4324, ptr addrspace(8) %3659, i32 %4325, i32 0, i32 0) + %4326 = insertelement <4 x half> poison, half %4308, i64 0 + %4327 = insertelement <4 x half> %4326, half %4309, i64 1 + %4328 = insertelement <4 x half> %4327, half %4310, i64 2 + %4329 = insertelement <4 x half> %4328, half %4311, i64 3 + %4330 = bitcast <4 x half> %4329 to <2 x i32> + %4331 = shl i32 %4301, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4330, ptr addrspace(8) %3659, i32 %4331, i32 0, i32 0) + %4332 = insertelement <4 x half> poison, half %4312, i64 0 + %4333 = insertelement <4 x half> %4332, half %4313, i64 1 + %4334 = insertelement <4 x half> %4333, half %4314, i64 2 + %4335 = insertelement <4 x half> %4334, half %4315, i64 3 + %4336 = bitcast <4 x half> %4335 to <2 x 
i32> + %4337 = shl i32 %4302, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4336, ptr addrspace(8) %3659, i32 %4337, i32 0, i32 0) + %4338 = insertelement <4 x half> poison, half %4316, i64 0 + %4339 = insertelement <4 x half> %4338, half %4317, i64 1 + %4340 = insertelement <4 x half> %4339, half %4318, i64 2 + %4341 = insertelement <4 x half> %4340, half %4319, i64 3 + %4342 = bitcast <4 x half> %4341 to <2 x i32> + %4343 = shl i32 %4303, 1 + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4342, ptr addrspace(8) %3659, i32 %4343, i32 0, i32 0) + ret void +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smin.i32(i32, i32) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.amdgcn.workitem.id.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.assume(i1 noundef) #3 + +; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn +declare void @llvm.amdgcn.s.barrier() #4 + +; Function Attrs: convergent mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32 immarg, i32 immarg, i32 
immarg) #5 + +; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32>, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #6 + +; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn + +attributes #0 = { nofree norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0" "denormal-fp-math-f32"="ieee" "uniform-work-group-size"="false" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #4 = { convergent mustprogress nocallback nofree nounwind willreturn } +attributes #5 = { convergent mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 1, !"amdhsa_code_object_version", i32 400} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!3 = !DIFile(filename: "", directory: "") +!4 = distinct 
!DISubprogram(name: "matmul_kernel", linkageName: "matmul_kernel", scope: !3, file: !3, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 222, column: 7, scope: !4) +!8 = !DILocation(line: 224, column: 7, scope: !4) +!9 = !DILocation(line: 225, column: 7, scope: !4) +!10 = !DILocation(line: 230, column: 7, scope: !4) +!11 = !DILocation(line: 231, column: 7, scope: !4) +!12 = !DILocation(line: 232, column: 7, scope: !4) +!13 = !DILocation(line: 233, column: 7, scope: !4) +!14 = !DILocation(line: 234, column: 7, scope: !4) +!15 = !DILocation(line: 235, column: 7, scope: !4) +!16 = !DILocation(line: 236, column: 7, scope: !4) +!17 = !DILocation(line: 237, column: 7, scope: !4) +!18 = !DILocation(line: 238, column: 7, scope: !4) +!19 = !DILocation(line: 241, column: 7, scope: !4) +!20 = !DILocation(line: 242, column: 7, scope: !4) +!21 = !DILocation(line: 247, column: 7, scope: !4) +!22 = !DILocation(line: 249, column: 7, scope: !4) +!23 = !DILocation(line: 250, column: 7, scope: !4) +!24 = !DILocation(line: 255, column: 7, scope: !4) +!25 = !DILocation(line: 256, column: 7, scope: !4) +!26 = !DILocation(line: 257, column: 7, scope: !4) +!27 = !DILocation(line: 258, column: 7, scope: !4) +!28 = !DILocation(line: 259, column: 7, scope: !4) +!29 = !DILocation(line: 260, column: 7, scope: !4) +!30 = !DILocation(line: 261, column: 7, scope: !4) +!31 = !DILocation(line: 262, column: 7, scope: !4) +!32 = !DILocation(line: 263, column: 7, scope: !4) +!33 = !DILocation(line: 266, column: 7, scope: !4) +!34 = !DILocation(line: 267, column: 7, scope: !4) +!35 = !DILocation(line: 275, column: 7, scope: !4) +!36 = !DILocation(line: 280, column: 7, scope: !4) +!37 = !DILocation(line: 285, column: 7, scope: !4) +!38 = !DILocation(line: 286, column: 7, scope: !4) +!39 = !DILocation(line: 287, column: 7, scope: !4) +!40 = !DILocation(line: 288, column: 7, scope: !4) +!41 = 
!DILocation(line: 289, column: 7, scope: !4) +!42 = !DILocation(line: 290, column: 7, scope: !4) +!43 = !DILocation(line: 291, column: 7, scope: !4) +!44 = !DILocation(line: 292, column: 7, scope: !4) +!45 = !DILocation(line: 293, column: 7, scope: !4) +!46 = !DILocation(line: 298, column: 7, scope: !4) +!47 = !DILocation(line: 299, column: 7, scope: !4) +!48 = !DILocation(line: 300, column: 7, scope: !4) +!49 = !DILocation(line: 301, column: 7, scope: !4) +!50 = !DILocation(line: 302, column: 7, scope: !4) +!51 = !DILocation(line: 303, column: 7, scope: !4) +!52 = !DILocation(line: 304, column: 7, scope: !4) +!53 = !DILocation(line: 305, column: 7, scope: !4) +!54 = !DILocation(line: 306, column: 7, scope: !4) +!55 = !DILocation(line: 309, column: 7, scope: !4) +!56 = !DILocation(line: 310, column: 7, scope: !4) +!57 = !DILocation(line: 315, column: 7, scope: !4) +!58 = !DILocation(line: 316, column: 7, scope: !4) +!59 = !DILocation(line: 321, column: 7, scope: !4) +!60 = !DILocation(line: 322, column: 7, scope: !4) +!61 = !DILocation(line: 323, column: 7, scope: !4) +!62 = !DILocation(line: 324, column: 7, scope: !4) +!63 = !DILocation(line: 325, column: 7, scope: !4) +!64 = !DILocation(line: 326, column: 7, scope: !4) +!65 = !DILocation(line: 327, column: 7, scope: !4) +!66 = !DILocation(line: 328, column: 7, scope: !4) +!67 = !DILocation(line: 329, column: 7, scope: !4) +!68 = !DILocation(line: 334, column: 7, scope: !4) +!69 = !DILocation(line: 335, column: 7, scope: !4) +!70 = !DILocation(line: 336, column: 7, scope: !4) +!71 = !DILocation(line: 337, column: 7, scope: !4) +!72 = !DILocation(line: 338, column: 7, scope: !4) +!73 = !DILocation(line: 339, column: 7, scope: !4) +!74 = !DILocation(line: 340, column: 7, scope: !4) +!75 = !DILocation(line: 341, column: 7, scope: !4) +!76 = !DILocation(line: 342, column: 7, scope: !4) +!77 = !DILocation(line: 347, column: 7, scope: !4) +!78 = !DILocation(line: 348, column: 7, scope: !4) +!79 = !DILocation(line: 364, 
column: 7, scope: !4) +!80 = !DILocation(line: 365, column: 7, scope: !4) +!81 = !DILocation(line: 366, column: 7, scope: !4) +!82 = !DILocation(line: 367, column: 7, scope: !4) +!83 = !DILocation(line: 368, column: 7, scope: !4) +!84 = !DILocation(line: 369, column: 7, scope: !4) +!85 = !DILocation(line: 370, column: 7, scope: !4) +!86 = !DILocation(line: 371, column: 7, scope: !4) +!87 = !DILocation(line: 372, column: 7, scope: !4) +!88 = !DILocation(line: 373, column: 7, scope: !4) +!89 = !DILocation(line: 374, column: 7, scope: !4) +!90 = !DILocation(line: 375, column: 7, scope: !4) +!91 = !DILocation(line: 376, column: 7, scope: !4) +!92 = !DILocation(line: 377, column: 7, scope: !4) +!93 = !DILocation(line: 378, column: 7, scope: !4) +!94 = !DILocation(line: 379, column: 7, scope: !4) +!95 = !DILocation(line: 380, column: 7, scope: !4) +!96 = !DILocation(line: 381, column: 7, scope: !4) +!97 = !DILocation(line: 382, column: 7, scope: !4) +!98 = !DILocation(line: 383, column: 7, scope: !4) +!99 = !DILocation(line: 384, column: 7, scope: !4) +!100 = !DILocation(line: 385, column: 7, scope: !4) +!101 = !DILocation(line: 386, column: 7, scope: !4) +!102 = !DILocation(line: 387, column: 7, scope: !4) +!103 = !DILocation(line: 388, column: 7, scope: !4) +!104 = !DILocation(line: 389, column: 7, scope: !4) +!105 = !DILocation(line: 390, column: 7, scope: !4) +!106 = !DILocation(line: 391, column: 7, scope: !4) +!107 = !DILocation(line: 392, column: 7, scope: !4) +!108 = !DILocation(line: 393, column: 7, scope: !4) +!109 = !DILocation(line: 394, column: 7, scope: !4) +!110 = !DILocation(line: 395, column: 7, scope: !4) +!111 = !DILocation(line: 396, column: 7, scope: !4) +!112 = !DILocation(line: 412, column: 7, scope: !4) +!113 = !DILocation(line: 413, column: 7, scope: !4) +!114 = !DILocation(line: 414, column: 7, scope: !4) +!115 = !DILocation(line: 415, column: 7, scope: !4) +!116 = !DILocation(line: 416, column: 7, scope: !4) +!117 = !DILocation(line: 417, 
column: 7, scope: !4) +!118 = !DILocation(line: 418, column: 7, scope: !4) +!119 = !DILocation(line: 419, column: 7, scope: !4) +!120 = !DILocation(line: 420, column: 7, scope: !4) +!121 = !DILocation(line: 421, column: 7, scope: !4) +!122 = !DILocation(line: 422, column: 7, scope: !4) +!123 = !DILocation(line: 423, column: 7, scope: !4) +!124 = !DILocation(line: 424, column: 7, scope: !4) +!125 = !DILocation(line: 425, column: 7, scope: !4) +!126 = !DILocation(line: 426, column: 7, scope: !4) +!127 = !DILocation(line: 427, column: 7, scope: !4) +!128 = !DILocation(line: 428, column: 7, scope: !4) +!129 = !DILocation(line: 429, column: 7, scope: !4) +!130 = !DILocation(line: 430, column: 7, scope: !4) +!131 = !DILocation(line: 431, column: 7, scope: !4) +!132 = !DILocation(line: 432, column: 7, scope: !4) +!133 = !DILocation(line: 433, column: 7, scope: !4) +!134 = !DILocation(line: 434, column: 7, scope: !4) +!135 = !DILocation(line: 435, column: 7, scope: !4) +!136 = !DILocation(line: 436, column: 7, scope: !4) +!137 = !DILocation(line: 437, column: 7, scope: !4) +!138 = !DILocation(line: 438, column: 7, scope: !4) +!139 = !DILocation(line: 439, column: 7, scope: !4) +!140 = !DILocation(line: 440, column: 7, scope: !4) +!141 = !DILocation(line: 441, column: 7, scope: !4) +!142 = !DILocation(line: 442, column: 7, scope: !4) +!143 = !DILocation(line: 443, column: 7, scope: !4) +!144 = !DILocation(line: 444, column: 7, scope: !4) +!145 = !DILocation(line: 449, column: 7, scope: !4) +!146 = !DILocation(line: 450, column: 7, scope: !4) +!147 = !DILocation(line: 456, column: 7, scope: !4) +!148 = !DILocation(line: 457, column: 7, scope: !4) +!149 = !DILocation(line: 458, column: 7, scope: !4) +!150 = !DILocation(line: 459, column: 7, scope: !4) +!151 = !DILocation(line: 460, column: 7, scope: !4) +!152 = !DILocation(line: 461, column: 7, scope: !4) +!153 = !DILocation(line: 462, column: 7, scope: !4) +!154 = !DILocation(line: 463, column: 7, scope: !4) +!155 = 
!DILocation(line: 464, column: 7, scope: !4) +!156 = !DILocation(line: 469, column: 7, scope: !4) +!157 = !DILocation(line: 470, column: 7, scope: !4) +!158 = !DILocation(line: 471, column: 7, scope: !4) +!159 = !DILocation(line: 472, column: 7, scope: !4) +!160 = !DILocation(line: 473, column: 7, scope: !4) +!161 = !DILocation(line: 474, column: 7, scope: !4) +!162 = !DILocation(line: 475, column: 7, scope: !4) +!163 = !DILocation(line: 476, column: 7, scope: !4) +!164 = !DILocation(line: 477, column: 7, scope: !4) +!165 = !DILocation(line: 480, column: 7, scope: !4) +!166 = !DILocation(line: 481, column: 7, scope: !4) +!167 = !DILocation(line: 166, column: 9, scope: !4) +!168 = !DILocation(line: 174, column: 9, scope: !4) +!169 = !DILocation(line: 175, column: 9, scope: !4) +!170 = !DILocation(line: 176, column: 9, scope: !4) +!171 = !DILocation(line: 177, column: 9, scope: !4) +!172 = !DILocation(line: 178, column: 9, scope: !4) +!173 = !DILocation(line: 179, column: 9, scope: !4) +!174 = !DILocation(line: 180, column: 9, scope: !4) +!175 = !DILocation(line: 181, column: 9, scope: !4) +!176 = !DILocation(line: 182, column: 9, scope: !4) +!177 = !DILocation(line: 183, column: 9, scope: !4) +!178 = !DILocation(line: 184, column: 9, scope: !4) +!179 = !DILocation(line: 185, column: 9, scope: !4) +!180 = !DILocation(line: 186, column: 9, scope: !4) +!181 = !DILocation(line: 187, column: 9, scope: !4) +!182 = !DILocation(line: 188, column: 9, scope: !4) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.max.ll new file mode 100644 index 0000000000000..74edd5a7a227b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.max.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) 
noalias %in, ptr addrspace(3) noalias %out) #0 { +; GCN-LABEL: test_iglp_opt_mfma_gemm: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 +; GCN-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-NEXT: ; iglp_opt mask(0x00000000) +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 +; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456 +; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440 +; GCN-NEXT: ds_read_b128 a[20:23], v2 offset:57424 +; GCN-NEXT: ds_read_b128 a[16:19], v2 offset:57408 +; GCN-NEXT: ds_read_b128 a[0:3], v2 offset:57344 +; GCN-NEXT: ds_read_b128 a[4:7], v2 offset:57360 +; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:57376 +; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:57392 +; GCN-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152 +; GCN-NEXT: s_waitcnt lgkmcnt(8) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(4) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[128:131], v1 +; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 +; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 +; GCN-NEXT: 
ds_read_b128 a[140:143], v1 offset:48 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] +; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] +; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[128:131] +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: s_waitcnt lgkmcnt(8) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[108:111] 
offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-NEXT: s_endpgm +entry: + call void @llvm.amdgcn.iglp.opt(i32 4) + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr + %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64 + %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr + %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128 + %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192 + %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256 
+ %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr + %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0) + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0) + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0) + %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0) + %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx + store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64 + store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr + %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128 + store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192 + store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256 + store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr + ret void +}